Skip to content

Commit d52fadd

Browse files
committed
Add depth filtering to bcftools consensus
1 parent b69f7ca commit d52fadd

File tree

2 files changed

+54
-22
lines changed

2 files changed

+54
-22
lines changed

SequenceAnalysis/resources/external/viral_consensus.sh

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ set -x
66
BAM=$1
77
FASTA=$2
88
MASK_BED=$3
9-
BCFTOOLS=${4:-bcftools}
9+
MIN_DEPTH=$4
10+
BCFTOOLS=${5:-bcftools}
1011

1112
VCF_CALLS=calls.vcf.gz
1213
THREADS=1
@@ -37,29 +38,37 @@ COUNT2=`$BCFTOOLS view -H $VCF_NORM | wc -l`
3738
echo 'Variants remaining: '$COUNT2
3839
echo -e 'VariantsAfterNorm\t'$COUNT2 >> $REPORT
3940

41+
# filter on depth
42+
echo 'Filtering indel clusters. Note: this is not currently used in the consensus.'
43+
VCF_DEPTH_FILTER=calls.norm.filter.vcf.gz
44+
$BCFTOOLS filter -e "DP < $MIN_DEPTH" -Oz -o $VCF_DEPTH_FILTER $VCF_NORM
45+
$BCFTOOLS index -t -f $VCF_DEPTH_FILTER
46+
COUNT3=`$BCFTOOLS view -H $VCF_DEPTH_FILTER | wc -l`
47+
echo 'Variants remaining: '$COUNT3
48+
echo -e 'VariantsAfterDpethFilter\t'$COUNT3 >> $REPORT
49+
4050
# filter adjacent indels within 5bp
4151
echo 'Filtering indel clusters. Note: this is not currently used in the consensus.'
4252
VCF_INDEL_FILTER=calls.norm.flt-indels.vcf.gz
43-
$BCFTOOLS filter --IndelGap 5 -Oz -o $VCF_INDEL_FILTER $VCF_NORM
53+
$BCFTOOLS filter --IndelGap 5 -Oz -o $VCF_INDEL_FILTER $VCF_DEPTH_FILTER
4454
$BCFTOOLS index -t -f $VCF_INDEL_FILTER
4555
COUNT3=`$BCFTOOLS view -H $VCF_INDEL_FILTER | wc -l`
4656
echo 'Variants that would remain: '$COUNT3
4757
echo -e 'VariantsAfterIndelFilter\t'$COUNT3 >> $REPORT
4858

49-
#At the moment, do not use the filtered version:
59+
#At the moment, do not use the indel-filtered version:
5060
VCF_FOR_CONSENSUS=`basename $BAM .bam`".calls.vcf.gz"
51-
mv $VCF_NORM $VCF_FOR_CONSENSUS
52-
mv ${VCF_NORM}.tbi ${VCF_FOR_CONSENSUS}.tbi
61+
mv $VCF_DEPTH_FILTER $VCF_FOR_CONSENSUS
62+
mv ${VCF_DEPTH_FILTER}.tbi ${VCF_FOR_CONSENSUS}.tbi
5363

5464
$BCFTOOLS consensus -f $FASTA -m $MASK_BED -o $OUT $VCF_FOR_CONSENSUS
5565
$BCFTOOLS consensus -f $FASTA -m $MASK_BED -o $OUT_IUPAC --iupac-codes $VCF_FOR_CONSENSUS
5666

5767
rm $VCF_CALLS
5868
rm ${VCF_CALLS}.tbi
5969

60-
#Previously moved:
61-
#rm $VCF_NORM
62-
#rm ${VCF_NORM}.tbi
70+
rm $VCF_NORM
71+
rm ${VCF_NORM}.tbi
6372

6473
rm $VCF_INDEL_FILTER
6574
rm ${VCF_INDEL_FILTER}.tbi

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/LofreqAnalysis.java

Lines changed: 37 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import org.labkey.api.sequenceanalysis.run.AbstractCommandWrapper;
3535
import org.labkey.api.sequenceanalysis.run.SimpleScriptWrapper;
3636
import org.labkey.api.util.FileUtil;
37+
import org.labkey.api.writer.PrintWriters;
3738
import org.labkey.sequenceanalysis.SequenceAnalysisModule;
3839
import org.labkey.sequenceanalysis.run.util.DepthOfCoverageWrapper;
3940
import org.labkey.sequenceanalysis.run.variant.SNPEffStep;
@@ -42,6 +43,7 @@
4243
import java.io.File;
4344
import java.io.IOException;
4445
import java.io.InputStream;
46+
import java.io.PrintWriter;
4547
import java.util.ArrayList;
4648
import java.util.Arrays;
4749
import java.util.Collections;
@@ -247,7 +249,7 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
247249
getPipelineCtx().getLogger().info("Total positions with coverage below threshold (" + minCoverage + "): " + positionsSkipped);
248250
getPipelineCtx().getLogger().info("Total intervals of these gaps: " + gapIntervals);
249251

250-
consensusWrapper.execute(Arrays.asList("/bin/bash", script.getPath(), inputBam.getPath(), referenceGenome.getWorkingFastaFile().getPath(), mask.getPath()));
252+
consensusWrapper.execute(Arrays.asList("/bin/bash", script.getPath(), inputBam.getPath(), referenceGenome.getWorkingFastaFile().getPath(), mask.getPath(), String.valueOf(minCoverage)));
251253
File calls = new File(inputBam.getParentFile(), FileUtil.getBaseName(inputBam) + ".calls.vcf.gz");
252254

253255
Set<VariantContext> variantsBcftoolsOnly = new HashSet<>();
@@ -269,20 +271,20 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
269271
}
270272
}
271273

272-
String description = String.format("Total Variants: %s\nTotal GT 1 PCT: %s\nTotal GT 50 PCT: %s\nTotal Indel GT 1 PCT: %s", totalVariants, totalGT1, totalGT50, totalIndelGT1);
274+
String description = String.format("Total Variants: %s\nTotal GT 1 PCT: %s\nTotal GT 50 PCT: %s\nTotal Indel GT 1 PCT: %s\tPositions Below Coverage: %s", totalVariants, totalGT1, totalGT50, totalIndelGT1, positionsSkipped);
273275

274276
if (!variantsBcftoolsOnly.isEmpty())
275277
{
276278
getPipelineCtx().getLogger().error("The following variants were in bcftools, but not GT50% in lofreq: ");
277-
variantsBcftoolsOnly.forEach(vc -> getPipelineCtx().getLogger().error(getHashKey(vc)));
279+
variantsBcftoolsOnly.forEach(vc -> getPipelineCtx().getLogger().error(getHashKey(vc) + ", DP=" + vc.getAttribute("DP")));
278280

279281
description += "\n" + "WARNING: " + variantsBcftoolsOnly.size() + " variants detected in bcftools and not lofreq";
280282
}
281283

282284
if (!alleleToAF.isEmpty())
283285
{
284286
getPipelineCtx().getLogger().error("The following variants were GT50% in lofreq, but not in bcftools: ");
285-
alleleToAF.keySet().forEach(vc -> getPipelineCtx().getLogger().error(vc));
287+
alleleToAF.keySet().forEach(vc -> getPipelineCtx().getLogger().error(vc + ", AF=" + alleleToAF.get(vc)));
286288

287289
description += "\n" + "WARNING: " + alleleToAF.size() + " variants detected in lofreq and not bcftools";
288290
}
@@ -293,23 +295,44 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
293295
throw new PipelineJobException("Expected file not found: " + consensusFasta.getPath());
294296
}
295297

298+
DNASequence seq;
296299
try (InputStream is = IOUtil.openFileForReading(consensusFasta))
297300
{
298301
FastaReader<DNASequence, NucleotideCompound> fastaReader = new FastaReader<>(is, new GenericFastaHeaderParser<>(), new DNASequenceCreator(AmbiguityDNACompoundSet.getDNACompoundSet()));
299302
LinkedHashMap<String, DNASequence> fastaData = fastaReader.process();
300303

301-
for (String fastaHeader : fastaData.keySet())
302-
{
303-
AtomicInteger totalN = new AtomicInteger();
304-
DNASequence seq = fastaData.get(fastaHeader);
305-
seq.forEach(nt -> {
306-
if (nt.getUpperedBase().equals("N")) {
307-
totalN.getAndIncrement();
308-
}
309-
});
304+
AtomicInteger totalN = new AtomicInteger();
305+
seq = fastaData.values().iterator().next();
306+
seq.forEach(nt -> {
307+
if (nt.getUpperedBase().equals("N")) {
308+
totalN.getAndIncrement();
309+
}
310+
});
310311

311-
description += "\nConsensus Ns: " + totalN.get();
312+
description += "\nConsensus Ns: " + totalN.get();
313+
}
314+
catch (IOException e)
315+
{
316+
throw new PipelineJobException(e);
317+
}
318+
319+
//Replace FASTA header:
320+
try (PrintWriter writer = PrintWriters.getPrintWriter(consensusFasta))
321+
{
322+
StringBuilder header = new StringBuilder();
323+
if (rs.getSubjectId() != null)
324+
{
325+
header.append(rs.getSubjectId()).append("|");
312326
}
327+
else
328+
{
329+
header.append(rs.getName()).append("|");
330+
}
331+
332+
header.append(rs.getLibraryType() == null ? rs.getApplication() : rs.getLibraryType());
333+
334+
writer.println(">" + header);
335+
writer.println(seq.getSequenceAsString());
313336
}
314337
catch (IOException e)
315338
{

0 commit comments

Comments
 (0)