Skip to content

Commit e572022

Browse files
committed
Add consensus generation to LoFreq output
1 parent bd92167 commit e572022

File tree

2 files changed

+150
-17
lines changed

2 files changed

+150
-17
lines changed
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
#!/bin/bash
2+
3+
set -e
4+
set -x
5+
6+
BAM=$1
7+
FASTA=$2
8+
MASK_BED=$3
9+
BCFTOOLS=${4:-bcftools}
10+
11+
VCF_CALLS=calls.vcf.gz
12+
THREADS=1
13+
if [ ! -z $SEQUENCEANALYSIS_MAX_THREADS ];then
14+
THREADS=$SEQUENCEANALYSIS_MAX_THREADS
15+
fi
16+
17+
REPORT=`basename $BAM .bam`".consensus.report.txt"
18+
truncate -s 0 $REPORT
19+
20+
OUT=`basename $BAM .bam`".consensus.fasta"
21+
OUT_IUPAC=`basename $BAM .bam`".consensus.iupac.fasta"
22+
23+
# call variants
24+
echo 'Calling variants'
25+
$BCFTOOLS mpileup -Ou -d 20000 -f $FASTA $BAM | $BCFTOOLS call --ploidy 1 --threads $THREADS -mv -Oz -o $VCF_CALLS
26+
$BCFTOOLS index $VCF_CALLS
27+
COUNT1=`$BCFTOOLS view -H $VCF_CALLS | wc -l`
28+
echo 'Variants called: '$COUNT1
29+
echo -e 'VariantsCalled\t'$COUNT1 >> $REPORT
30+
31+
# normalize indels
32+
echo 'Normalize indels'
33+
VCF_NORM=calls.norm.bcf
34+
$BCFTOOLS norm -f $FASTA --threads $THREADS -Ob -o $VCF_NORM $VCF_CALLS
35+
$BCFTOOLS index -f $VCF_NORM
36+
COUNT2=`$BCFTOOLS view -H $VCF_NORM | wc -l`
37+
echo 'Variants remaining: '$COUNT2
38+
echo -e 'VariantsAfterNorm\t'$COUNT2 >> $REPORT
39+
40+
# filter adjacent indels within 5bp
41+
echo 'Filtering indel clusters. Note: this is not currently used in the consensus.'
42+
VCF_INDEL_FILTER=calls.norm.flt-indels.bcf
43+
$BCFTOOLS filter --IndelGap 5 -Ob -o $VCF_INDEL_FILTER $VCF_NORM
44+
$BCFTOOLS index -f $VCF_INDEL_FILTER
45+
COUNT3=`$BCFTOOLS view -H $VCF_INDEL_FILTER | wc -l`
46+
echo 'Variants that would remain: '$COUNT3
47+
echo -e 'VariantsAfterIndelFilter\t'$COUNT3 >> $REPORT
48+
49+
#At the moment, do not user the filtered version:
50+
VCF_FOR_CONSENSUS=$VCF_NORM
51+
$BCFTOOLS consensus -f $FASTA -m $MASK_BED -o $OUT $VCF_FOR_CONSENSUS
52+
$BCFTOOLS consensus -f $FASTA -m $MASK_BED -o $OUT_IUPAC --iupac-codes $VCF_FOR_CONSENSUS
53+
54+
rm $VCF_CALLS
55+
rm ${VCF_CALLS}.csi
56+
57+
rm $VCF_NORM
58+
rm ${VCF_NORM}.csi
59+
60+
rm $VCF_INDEL_FILTER
61+
rm ${VCF_INDEL_FILTER}.csi

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/LofreqAnalysis.java

Lines changed: 89 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,17 @@
11
package org.labkey.sequenceanalysis.run.analysis;
22

3+
import au.com.bytecode.opencsv.CSVReader;
4+
import au.com.bytecode.opencsv.CSVWriter;
35
import htsjdk.samtools.util.CloseableIterator;
6+
import htsjdk.samtools.util.IOUtil;
7+
import htsjdk.samtools.util.Interval;
48
import htsjdk.variant.variantcontext.VariantContext;
59
import htsjdk.variant.vcf.VCFFileReader;
610
import org.apache.log4j.Logger;
711
import org.jetbrains.annotations.Nullable;
812
import org.json.JSONObject;
913
import org.labkey.api.pipeline.PipelineJobException;
14+
import org.labkey.api.reader.Readers;
1015
import org.labkey.api.sequenceanalysis.SequenceAnalysisService;
1116
import org.labkey.api.sequenceanalysis.model.AnalysisModel;
1217
import org.labkey.api.sequenceanalysis.model.Readset;
@@ -53,7 +58,11 @@ public Provider()
5358
put("extensions", Arrays.asList("gtf", "gff"));
5459
put("width", 400);
5560
put("allowBlank", false);
56-
}}, null)
61+
}}, null),
62+
ToolParameterDescriptor.create("minCoverage", "Min Coverage For Consensus", "If provided, a consensus will only be called over regions with at least this depth", "ldk-integerfield", new JSONObject(){{
63+
put("minValue", 0);
64+
}}, 25)
65+
5766
), null, "http://csb5.github.io/lofreq/");
5867
}
5968

@@ -127,22 +136,85 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
127136
}
128137

129138
//generate consensus
130-
// File script = new File(SequenceAnalysisService.get().getScriptPath(SequenceAnalysisModule.NAME, "external/viral_consensus.sh"));
131-
// if (!script.exists())
132-
// {
133-
// throw new PipelineJobException("Unable to find script: " + script.getPath());
134-
// }
135-
//
136-
// SimpleScriptWrapper consensusWrapper = new SimpleScriptWrapper(getPipelineCtx().getLogger());
137-
// consensusWrapper.setWorkingDir(getPipelineCtx().getWorkingDirectory());
138-
//
139-
// Integer maxThreads = SequencePipelineService.get().getMaxThreads(getPipelineCtx().getLogger());
140-
// if (maxThreads != null)
141-
// {
142-
// consensusWrapper.addToEnvironment("SEQUENCEANALYSIS_MAX_THREADS", maxThreads.toString());
143-
// }
144-
//
145-
// consensusWrapper.execute(Arrays.asList("/bin/bash", script.getName(), inputBam.getPath(), referenceGenome.getWorkingFastaFile().getPath()));
139+
File script = new File(SequenceAnalysisService.get().getScriptPath(SequenceAnalysisModule.NAME, "external/viral_consensus.sh"));
140+
if (!script.exists())
141+
{
142+
throw new PipelineJobException("Unable to find script: " + script.getPath());
143+
}
144+
145+
SimpleScriptWrapper consensusWrapper = new SimpleScriptWrapper(getPipelineCtx().getLogger());
146+
consensusWrapper.setWorkingDir(getPipelineCtx().getWorkingDirectory());
147+
148+
Integer maxThreads = SequencePipelineService.get().getMaxThreads(getPipelineCtx().getLogger());
149+
if (maxThreads != null)
150+
{
151+
consensusWrapper.addToEnvironment("SEQUENCEANALYSIS_MAX_THREADS", maxThreads.toString());
152+
}
153+
154+
//Create a BED file with all regions of coverage below MIN_COVERAGE:
155+
int minCoverage = getProvider().getParameterByName("minCoverage").extractValue(getPipelineCtx().getJob(), getProvider(), getStepIdx(), Integer.class);
156+
int positionsSkipped = 0;
157+
int gapIntervals = 0;
158+
159+
File mask = new File(outputDir, "mask.bed");
160+
try (CSVReader reader = new CSVReader(Readers.getReader(coverageOut), '\t');CSVWriter writer = new CSVWriter(IOUtil.openFileForBufferedUtf8Writing(mask), '\t', CSVWriter.NO_QUOTE_CHARACTER))
161+
{
162+
String[] line;
163+
164+
Interval intervalOfCurrentGap = null;
165+
166+
while ((line = reader.readNext()) != null)
167+
{
168+
String[] tokens = line[0].split(":");
169+
int depth = Integer.parseInt(line[1]);
170+
171+
if (depth < minCoverage)
172+
{
173+
positionsSkipped++;
174+
175+
if (intervalOfCurrentGap != null)
176+
{
177+
if (intervalOfCurrentGap.getContig().equals(tokens[0]))
178+
{
179+
//extend
180+
intervalOfCurrentGap = new Interval(intervalOfCurrentGap.getContig(), intervalOfCurrentGap.getStart(), Integer.parseInt(tokens[1]));
181+
}
182+
else
183+
{
184+
//switched contigs, write and make new:
185+
writer.writeNext(new String[]{intervalOfCurrentGap.getContig(), String.valueOf(intervalOfCurrentGap.getStart()-1), String.valueOf(intervalOfCurrentGap.getEnd())});
186+
gapIntervals++;
187+
intervalOfCurrentGap = new Interval(tokens[0], Integer.parseInt(tokens[1]), Integer.parseInt(tokens[1]));
188+
}
189+
}
190+
else
191+
{
192+
//Not existing gap, just start one:
193+
intervalOfCurrentGap = new Interval(tokens[0], Integer.parseInt(tokens[1]), Integer.parseInt(tokens[1]));
194+
}
195+
}
196+
else
197+
{
198+
//We just existed a gap, so write:
199+
if (intervalOfCurrentGap != null)
200+
{
201+
writer.writeNext(new String[]{intervalOfCurrentGap.getContig(), String.valueOf(intervalOfCurrentGap.getStart()-1), String.valueOf(intervalOfCurrentGap.getEnd())});
202+
gapIntervals++;
203+
}
204+
205+
intervalOfCurrentGap = null;
206+
}
207+
}
208+
}
209+
catch (IOException e)
210+
{
211+
throw new PipelineJobException(e);
212+
}
213+
214+
getPipelineCtx().getLogger().info("Total positions with coverage below threshold (" + minCoverage + "): " + positionsSkipped);
215+
getPipelineCtx().getLogger().info("Total intervals of these gaps: " + gapIntervals);
216+
217+
consensusWrapper.execute(Arrays.asList("/bin/bash", script.getName(), inputBam.getPath(), referenceGenome.getWorkingFastaFile().getPath(), mask.getPath()));
146218

147219
String description = String.format("Total Variants: %s\nTotal GT 1 PCT: %s\nTotal Indel GT 1 PCT: %s", totalVariants, totalGT1, totalIndelGT1);
148220
output.addSequenceOutput(outputVcfSnpEff, "LoFreq: " + rs.getName(), CATEGORY, rs.getReadsetId(), null, referenceGenome.getGenomeId(), description);

0 commit comments

Comments
 (0)