Skip to content

Commit e41af6e

Browse files
committed
Add WhatsHap phasing step
1 parent ac5fa20 commit e41af6e

2 files changed

Lines changed: 175 additions & 0 deletions

File tree

SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisModule.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,7 @@
183183
import org.labkey.sequenceanalysis.run.variant.VariantFiltrationStep;
184184
import org.labkey.sequenceanalysis.run.variant.VariantQCStep;
185185
import org.labkey.sequenceanalysis.run.variant.VariantsToTableStep;
186+
import org.labkey.sequenceanalysis.run.variant.WhatsHapStep;
186187
import org.labkey.sequenceanalysis.util.Barcoder;
187188
import org.labkey.sequenceanalysis.util.ChainFileValidator;
188189
import org.labkey.sequenceanalysis.util.ScatterGatherUtils;
@@ -374,6 +375,7 @@ public static void registerPipelineSteps()
374375
SequencePipelineService.get().registerPipelineStep(new BcftoolsFixploidyStep.Provider());
375376
SequencePipelineService.get().registerPipelineStep(new BcftoolsFillFromFastaStep.Provider());
376377
SequencePipelineService.get().registerPipelineStep(new SVAnnotateStep.Provider());
378+
SequencePipelineService.get().registerPipelineStep(new WhatsHapStep.Provider());
377379

378380
//handlers
379381
SequenceAnalysisService.get().registerFileHandler(new LiftoverHandler());
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
package org.labkey.sequenceanalysis.run.variant;
2+
3+
import htsjdk.samtools.util.Interval;
4+
import htsjdk.variant.vcf.VCFFileReader;
5+
import htsjdk.variant.vcf.VCFHeader;
6+
import org.apache.logging.log4j.Logger;
7+
import org.jetbrains.annotations.Nullable;
8+
import org.labkey.api.data.Container;
9+
import org.labkey.api.data.SimpleFilter;
10+
import org.labkey.api.data.TableInfo;
11+
import org.labkey.api.data.TableSelector;
12+
import org.labkey.api.pipeline.PipelineJob;
13+
import org.labkey.api.pipeline.PipelineJobException;
14+
import org.labkey.api.query.FieldKey;
15+
import org.labkey.api.query.QueryService;
16+
import org.labkey.api.sequenceanalysis.SequenceAnalysisService;
17+
import org.labkey.api.sequenceanalysis.SequenceOutputFile;
18+
import org.labkey.api.sequenceanalysis.pipeline.AbstractVariantProcessingStepProvider;
19+
import org.labkey.api.sequenceanalysis.pipeline.PipelineContext;
20+
import org.labkey.api.sequenceanalysis.pipeline.PipelineStepProvider;
21+
import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
22+
import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport;
23+
import org.labkey.api.sequenceanalysis.pipeline.SequencePipelineService;
24+
import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStep;
25+
import org.labkey.api.sequenceanalysis.pipeline.VariantProcessingStepOutputImpl;
26+
import org.labkey.api.sequenceanalysis.run.AbstractCommandPipelineStep;
27+
import org.labkey.api.sequenceanalysis.run.AbstractCommandWrapper;
28+
import org.labkey.api.util.FileUtil;
29+
import org.labkey.api.util.PageFlowUtil;
30+
import org.labkey.sequenceanalysis.SequenceAnalysisSchema;
31+
32+
import java.io.File;
33+
import java.io.IOException;
34+
import java.util.ArrayList;
35+
import java.util.Collections;
36+
import java.util.List;
37+
38+
public class WhatsHapStep extends AbstractCommandPipelineStep<WhatsHapStep.WhatsHapWrapper> implements VariantProcessingStep
39+
{
40+
public WhatsHapStep(PipelineStepProvider<?> provider, PipelineContext ctx)
41+
{
42+
super(provider, ctx, new WhatsHapStep.WhatsHapWrapper(ctx.getLogger()));
43+
}
44+
45+
public static class Provider extends AbstractVariantProcessingStepProvider<WhatsHapStep> implements SupportsPedigree
46+
{
47+
public Provider()
48+
{
49+
super("WhatsHap", "WhatsHap", "", "This will run WhatsHap to phase the VCF using BAM/CRAM data", List.of(
50+
51+
), null, "https://whatshap.readthedocs.io/en/latest/");
52+
}
53+
54+
@Override
55+
public WhatsHapStep create(PipelineContext ctx)
56+
{
57+
return new WhatsHapStep(this, ctx);
58+
}
59+
}
60+
61+
@Override
62+
public void init(PipelineJob job, SequenceAnalysisJobSupport support, List<SequenceOutputFile> inputFiles) throws PipelineJobException
63+
{
64+
if (inputFiles.size() != 1)
65+
{
66+
throw new PipelineJobException("This step expects a single VCF as input");
67+
}
68+
69+
// look up BAM/CRAMs:
70+
for (SequenceOutputFile so : inputFiles)
71+
{
72+
List<String> samples;
73+
try (VCFFileReader reader = new VCFFileReader(so.getFile()))
74+
{
75+
VCFHeader header = reader.getFileHeader();
76+
samples = header.getSampleNamesInOrder();
77+
}
78+
79+
if (samples.isEmpty())
80+
{
81+
throw new IllegalStateException("No samples found in VCF file");
82+
}
83+
84+
ArrayList<Long> toCache = new ArrayList<>();
85+
Container targetContainer = getPipelineCtx().getJob().getContainer().isWorkbookOrTab() ? getPipelineCtx().getJob().getContainer().getParent() : getPipelineCtx().getJob().getContainer();
86+
TableInfo outputFiles = QueryService.get().getUserSchema(getPipelineCtx().getJob().getUser(), targetContainer, SequenceAnalysisSchema.SCHEMA_NAME).getTable(SequenceAnalysisSchema.TABLE_OUTPUTFILES);
87+
for (String sample : samples)
88+
{
89+
// Find readsets for this genome:
90+
SimpleFilter filter1 = new SimpleFilter(FieldKey.fromString("readset/name"), sample).
91+
addCondition(FieldKey.fromString("library_id"), so.getLibrary_id()).
92+
addCondition(FieldKey.fromString("category"), "Alignment");
93+
94+
List<Integer> alignments = new TableSelector(outputFiles, PageFlowUtil.set("rowid"), filter1, null).getArrayList(Integer.class);
95+
if (alignments.isEmpty())
96+
{
97+
throw new PipelineJobException("Unable to find alignment for: " + sample);
98+
}
99+
100+
SequenceOutputFile alignmentFile = SequenceOutputFile.getForId(Collections.max(alignments));
101+
toCache.add(alignmentFile.getDataId());
102+
support.cacheExpData(alignmentFile.getExpData());
103+
}
104+
105+
support.cacheObject(CACHE_KEY, toCache);
106+
}
107+
}
108+
109+
private final String CACHE_KEY = "~cached_readsets~";
110+
111+
private List<File> getCachedBams() throws PipelineJobException
112+
{
113+
List<Long> cachedFiles = getPipelineCtx().getSequenceSupport().getCachedObject(CACHE_KEY, PipelineJob.createObjectMapper().getTypeFactory().constructParametricType(List.class, Long.class));
114+
115+
return cachedFiles.stream().map(x -> getPipelineCtx().getSequenceSupport().getCachedData(x)).toList();
116+
}
117+
118+
@Override
119+
public Output processVariants(File inputVCF, File outputDirectory, ReferenceGenome genome, @Nullable List<Interval> intervals) throws PipelineJobException
120+
{
121+
VariantProcessingStepOutputImpl output = new VariantProcessingStepOutputImpl();
122+
123+
output.addInput(inputVCF, "Input VCF");
124+
output.addInput(genome.getWorkingFastaFile(), "Reference Genome");
125+
126+
File vcfOut = FileUtil.appendName(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(inputVCF.getName()) + ".phased.vcf.gz");
127+
128+
List<String> args = new ArrayList<>();
129+
args.add(getWrapper().getExe().getPath());
130+
args.add("-o");
131+
args.add(vcfOut.getPath());
132+
args.add("--reference");
133+
args.add(genome.getWorkingFastaFile().getPath());
134+
args.add(inputVCF.getPath());
135+
for (File f : getCachedBams())
136+
{
137+
args.add(f.getPath());
138+
}
139+
140+
getWrapper().execute(args);
141+
142+
if (!vcfOut.exists())
143+
{
144+
throw new PipelineJobException("Missing file: " + vcfOut.getPath());
145+
}
146+
147+
try
148+
{
149+
SequenceAnalysisService.get().ensureVcfIndex(vcfOut, getPipelineCtx().getLogger());
150+
}
151+
catch (IOException e)
152+
{
153+
throw new PipelineJobException(e);
154+
}
155+
156+
output.addSequenceOutput(vcfOut, "Phased VCF: " + inputVCF.getName(), "Phased VCF", null, null, genome.getGenomeId(), null);
157+
158+
return output;
159+
}
160+
161+
public static class WhatsHapWrapper extends AbstractCommandWrapper
162+
{
163+
public WhatsHapWrapper(@Nullable Logger logger)
164+
{
165+
super(logger);
166+
}
167+
168+
public File getExe()
169+
{
170+
return SequencePipelineService.get().getExeForPackage("WHATSHAPPATH", "whatshap");
171+
}
172+
}
173+
}

0 commit comments

Comments
 (0)