Commit cc27375

Improve KING/plink2 implementation

1 parent 02e696e commit cc27375

3 files changed, +82 -136 lines changed

SequenceAnalysis/api-src/org/labkey/api/sequenceanalysis/pipeline/BcftoolsRunner.java

Lines changed: 33 additions & 0 deletions
@@ -2,9 +2,13 @@
 
 import org.apache.logging.log4j.Logger;
 import org.jetbrains.annotations.Nullable;
+import org.labkey.api.pipeline.PipelineJobException;
+import org.labkey.api.pipeline.PipelineJobService;
 import org.labkey.api.sequenceanalysis.run.AbstractCommandWrapper;
 
 import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
 
 /**
  * User: bimber
@@ -22,4 +26,33 @@ public static File getBcfToolsPath()
     {
         return SequencePipelineService.get().getExeForPackage("BCFTOOLSPATH", "bcftools");
     }
+
+    public static boolean isBcftoolsFound()
+    {
+        return BcftoolsRunner.resolveFileInPath("bcftools", null, false) != null;
+    }
+
+    public void doIndex(File vcf) throws PipelineJobException
+    {
+        List<String> args = new ArrayList<>();
+        args.add(getBcfToolsPath().getAbsolutePath());
+        args.add("index");
+        args.add("-t");
+        args.add("-f");
+        args.add("-n");
+
+        if (!PipelineJobService.get().isWebServer())
+        {
+            Integer threads = SequencePipelineService.get().getMaxThreads(getLogger());
+            if (threads != null)
+            {
+                args.add("--threads");
+                args.add(String.valueOf(threads));
+            }
+        }
+
+        args.add(vcf.getAbsolutePath());
+
+        execute(args);
+    }
 }
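For context, the new doIndex() method shells out to bcftools to build a tabix-format (.tbi) index next to the VCF. Below is a minimal standalone sketch of an equivalent invocation without the LabKey wrapper and pipeline services; the bcftools path, input file, and thread count are illustrative assumptions, not values from this commit.

import java.io.File;
import java.io.IOException;

public class BcftoolsIndexSketch
{
    public static void main(String[] args) throws IOException, InterruptedException
    {
        File vcf = new File("/data/variants.vcf.gz"); // hypothetical bgzipped input
        int threads = 4;                              // hypothetical thread count

        // Core flags mirrored from doIndex(): -t writes a tabix-format index, -f overwrites an existing one
        ProcessBuilder pb = new ProcessBuilder(
                "/usr/local/bin/bcftools", "index", "-t", "-f",
                "--threads", String.valueOf(threads),
                vcf.getAbsolutePath());
        pb.inheritIO();

        int exitCode = pb.start().waitFor();
        if (exitCode != 0)
        {
            throw new IOException("bcftools index failed with exit code: " + exitCode);
        }
    }
}

The -n argument and the isWebServer()/getMaxThreads() gating in the commit are specific to the LabKey pipeline environment and are not reproduced in this sketch.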

SequenceAnalysis/src/org/labkey/sequenceanalysis/SequenceAnalysisServiceImpl.java

Lines changed: 11 additions & 2 deletions
@@ -36,6 +36,7 @@
 import org.labkey.api.sequenceanalysis.SequenceAnalysisService;
 import org.labkey.api.sequenceanalysis.SequenceDataProvider;
 import org.labkey.api.sequenceanalysis.model.Readset;
+import org.labkey.api.sequenceanalysis.pipeline.BcftoolsRunner;
 import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
 import org.labkey.api.sequenceanalysis.pipeline.SamtoolsCramConverter;
 import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler;
@@ -267,8 +268,16 @@ public File ensureVcfIndex(File vcf, Logger log, boolean forceRecreate) throws I
             //note: there is a bug in htsjdk's index creation with gz inputs
             if (gz.isType(vcf) && !SystemUtils.IS_OS_WINDOWS)
             {
-                TabixRunner r = new TabixRunner(log);
-                r.execute(vcf);
+                // preferentially use bcftools since it supports multithreading:
+                if (BcftoolsRunner.isBcftoolsFound())
+                {
+                    new BcftoolsRunner(log).doIndex(vcf);
+                }
+                else
+                {
+                    new TabixRunner(log).execute(vcf);
+                }
+
                 if (!expectedIdx.exists())
                 {
                     throw new PipelineJobException("Expected index was not created: " + expectedIdx.getPath());

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/variant/KingInferenceStep.java

Lines changed: 38 additions & 134 deletions
@@ -1,5 +1,6 @@
 package org.labkey.sequenceanalysis.run.variant;
 
+import com.google.common.io.Files;
 import htsjdk.samtools.SAMSequenceDictionary;
 import htsjdk.samtools.SAMSequenceRecord;
 import htsjdk.samtools.util.Interval;
@@ -10,10 +11,7 @@
 import org.apache.logging.log4j.Logger;
 import org.jetbrains.annotations.Nullable;
 import org.json.JSONObject;
-import org.labkey.api.collections.CaseInsensitiveHashMap;
 import org.labkey.api.pipeline.PipelineJobException;
-import org.labkey.api.reader.Readers;
-import org.labkey.api.sequenceanalysis.SequenceAnalysisService;
 import org.labkey.api.sequenceanalysis.pipeline.AbstractVariantProcessingStepProvider;
 import org.labkey.api.sequenceanalysis.pipeline.PedigreeToolParameterDescriptor;
 import org.labkey.api.sequenceanalysis.pipeline.PipelineContext;
@@ -26,18 +24,11 @@
 import org.labkey.api.sequenceanalysis.run.AbstractCommandPipelineStep;
 import org.labkey.api.sequenceanalysis.run.AbstractCommandWrapper;
 import org.labkey.api.util.Compress;
-import org.labkey.api.writer.PrintWriters;
-import org.labkey.sequenceanalysis.pipeline.ProcessVariantsHandler;
 
-import java.io.BufferedReader;
 import java.io.File;
 import java.io.IOException;
-import java.io.PrintWriter;
 import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 
 public class KingInferenceStep extends AbstractCommandPipelineStep<KingInferenceStep.KingWrapper> implements VariantProcessingStep
 {
@@ -50,7 +41,7 @@ public static class Provider extends AbstractVariantProcessingStepProvider<KingI
     {
         public Provider()
         {
-            super("KingInferenceStep", "KING/Relatedness", "", "This will run KING to infer kinship from a VCF", List.of(
+            super("KingInferenceStep", "KING/Relatedness", "", "This will run KING (via plink2) to infer kinship from a VCF", List.of(
                    ToolParameterDescriptor.create("limitToChromosomes", "Limit to Chromosomes", "If checked, the analysis will include only the primary chromosomes", "checkbox", new JSONObject()
                    {{
                        put("checked", true);
@@ -148,7 +139,24 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno
         plinkArgs1.add("--out");
         plinkArgs1.add(plinkOut.getPath());
 
-        plink.execute(plinkArgs1);
+        File doneFile = new File (plinkOut.getPath() + ".done");
+        output.addIntermediateFile(doneFile);
+        if (doneFile.exists())
+        {
+            getPipelineCtx().getLogger().debug("plink has completed, will not repeat");
+        }
+        else {
+            plink.execute(plinkArgs1);
+
+            try
+            {
+                Files.touch(doneFile);
+            }
+            catch (IOException e)
+            {
+                throw new PipelineJobException(e);
+            }
+        }
 
         File plinkOutBed = new File(plinkOut.getPath() + ".bed");
         if (!plinkOutBed.exists())
@@ -163,7 +171,23 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno
         plinkArgs2.add("--out");
         plinkArgs2.add(plinkOutKing.getPath());
 
-        plink.execute(plinkArgs2);
+        doneFile = new File (plinkOutKing.getPath() + ".done");
+        if (doneFile.exists())
+        {
+            getPipelineCtx().getLogger().debug("plink has completed, will not repeat");
+        }
+        else {
+            plink.execute(plinkArgs2);
+
+            try
+            {
+                Files.touch(doneFile);
+            }
+            catch (IOException e)
+            {
+                throw new PipelineJobException(e);
+            }
+        }
 
         File plinkOutKingFile = new File(plinkOutKing.getPath() + ".kin0");
         if (!plinkOutKingFile.exists())
@@ -188,131 +212,11 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno
             throw new PipelineJobException(e);
         }
 
-        output.addSequenceOutput(plinkOutKingFileTxt, "PLINK2 Relatedness: " + inputVCF.getName(), "PLINK2 Kinship", null, null, genome.getGenomeId(), "Total lines: " + lineCount);
-
-        // Also with KING:
-        KingWrapper wrapper = new KingWrapper(getPipelineCtx().getLogger());
-        wrapper.setWorkingDir(outputDirectory);
-
-        List<String> kingArgs = new ArrayList<>();
-        kingArgs.add(wrapper.getExe().getPath());
-
-        kingArgs.add("-b");
-        kingArgs.add(plinkOutBed.getPath());
-
-        kingArgs.add("--prefix");
-        kingArgs.add(SequenceAnalysisService.get().getUnzippedBaseName(inputVCF.getName()));
-
-        // Update the pedigree / fam file:
-        String demographicsProviderName = getProvider().getParameterByName(PedigreeToolParameterDescriptor.NAME).extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx());
-        if (demographicsProviderName != null)
-        {
-            File pedFile = ProcessVariantsHandler.getPedigreeFile(getPipelineCtx().getSourceDirectory(true), demographicsProviderName);
-            if (!pedFile.exists())
-            {
-                throw new PipelineJobException("Unable to find pedigree file: " + pedFile.getPath());
-            }
-
-            File kingFam = createFamFile(pedFile, new File(plinkOutBed.getParentFile(), "plink.fam"));
-            kingArgs.add("--fam");
-            kingArgs.add(kingFam.getPath());
-
-            output.addIntermediateFile(kingFam);
-        }
-
-        if (threads != null)
-        {
-            kingArgs.add("--cpus");
-            kingArgs.add(threads.toString());
-        }
-
-        kingArgs.add("--kinship");
-        kingArgs.add("--rplot");
-
-        File kinshipOutput = new File(outputDirectory, SequenceAnalysisService.get().getUnzippedBaseName(inputVCF.getName()) + ".kin");
-        wrapper.execute(kingArgs);
-        if (!kinshipOutput.exists())
-        {
-            throw new PipelineJobException("Unable to find file: " + kinshipOutput.getPath());
-        }
-
-        File kinshipOutputTxt = new File(kinshipOutput.getPath() + ".txt.gz");
-        if (kinshipOutputTxt.exists())
-        {
-            kinshipOutputTxt.delete();
-        }
-
-        lineCount = SequencePipelineService.get().getLineCount(kinshipOutput)-1;
-        try
-        {
-            Compress.compressGzip(kinshipOutput, kinshipOutputTxt);
-            FileUtils.delete(kinshipOutput);
-        }
-        catch (IOException e)
-        {
-            throw new PipelineJobException(e);
-        }
-
-        output.addSequenceOutput(kinshipOutputTxt, "King Relatedness: " + inputVCF.getName(), "KING Relatedness", null, null, genome.getGenomeId(), "Total lines: " + lineCount);
+        output.addSequenceOutput(plinkOutKingFileTxt, "PLINK2/KING Relatedness: " + inputVCF.getName(), "PLINK2/KING Kinship", null, null, genome.getGenomeId(), "Total lines: " + lineCount);
 
         return output;
     }
 
-    private File createFamFile(File pedFile, File famFile) throws PipelineJobException
-    {
-        File newFamFile = new File(famFile.getParentFile(), "king.fam");
-
-        Map<String, String> pedMap = new CaseInsensitiveHashMap<>();
-        try (BufferedReader reader = Readers.getReader(pedFile))
-        {
-            String line;
-            while ((line = reader.readLine()) != null)
-            {
-                String[] tokens = line.split(" ");
-                if (tokens.length != 6)
-                {
-                    throw new PipelineJobException("Improper ped line length: " + tokens.length);
-                }
-
-                pedMap.put(tokens[1], StringUtils.join(Arrays.asList("0", tokens[1], tokens[2], tokens[3], tokens[4], "-9"), "\t"));
-            }
-        }
-        catch (IOException e)
-        {
-            throw new PipelineJobException(e);
-        }
-
-        try (BufferedReader reader = Readers.getReader(famFile);PrintWriter writer = PrintWriters.getPrintWriter(newFamFile))
-        {
-            String line;
-            while ((line = reader.readLine()) != null)
-            {
-                String[] tokens = line.split("\t");
-                if (tokens.length != 6)
-                {
-                    throw new PipelineJobException("Improper ped line length: " + tokens.length);
-                }
-
-                String newRow = pedMap.get(tokens[1]);
-                if (newRow == null)
-                {
-                    getPipelineCtx().getLogger().warn("Unable to find pedigree entry for: " + tokens[1] + ", reusing original");
-                    writer.println(line);
-                }
-                else
-                {
-                    writer.println(newRow);
-                }
-            }
-        }
-        catch (IOException e)
-        {
-            throw new PipelineJobException(e);
-        }
-
-        return newFamFile;
-    }
-
     public static class KingWrapper extends AbstractCommandWrapper
     {
         public KingWrapper(@Nullable Logger logger)
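The diff above drops the separate KING binary invocation (and the createFamFile() pedigree rewrite that only KING needed), keeps the plink2-produced .kin0 kinship output, and wraps each plink call in a ".done" marker check so a resumed job does not repeat completed work. Below is a minimal sketch of that marker-file guard pulled out as a reusable helper; the class and method names are hypothetical and not part of the commit, and Guava (com.google.common.io.Files) is assumed on the classpath, as it already is for the import added above.

import com.google.common.io.Files;

import java.io.File;
import java.io.IOException;

public class DoneFileGuard
{
    public interface Step
    {
        void run() throws IOException;
    }

    // Runs the step only if its marker file is absent; touches the marker only after the step succeeds.
    public static void runOnce(File doneFile, Step step) throws IOException
    {
        if (doneFile.exists())
        {
            System.out.println("step has completed, will not repeat");
            return;
        }

        step.run();

        // Written only on success, so a failed or interrupted run is retried next time
        Files.touch(doneFile);
    }
}

In the commit itself this logic is written inline around each plink.execute() call, with Files.touch() wrapped in a try/catch that rethrows IOException as PipelineJobException.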
