Skip to content

Commit ffb7d0b

Browse files
committed
More reporting around viral consensus generation
1 parent 1c2e423 commit ffb7d0b

File tree

2 files changed

+75
-14
lines changed

2 files changed

+75
-14
lines changed

SequenceAnalysis/resources/external/viral_consensus.sh

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,39 +23,43 @@ OUT_IUPAC=`basename $BAM .bam`".consensus.iupac.fasta"
2323
# call variants
2424
echo 'Calling variants'
2525
$BCFTOOLS mpileup -Ou -d 20000 -f $FASTA $BAM | $BCFTOOLS call --ploidy 1 --threads $THREADS -mv -Oz -o $VCF_CALLS
26-
$BCFTOOLS index $VCF_CALLS
26+
$BCFTOOLS index -t $VCF_CALLS
2727
COUNT1=`$BCFTOOLS view -H $VCF_CALLS | wc -l`
2828
echo 'Variants called: '$COUNT1
2929
echo -e 'VariantsCalled\t'$COUNT1 >> $REPORT
3030

3131
# normalize indels
3232
echo 'Normalize indels'
33-
VCF_NORM=calls.norm.bcf
34-
$BCFTOOLS norm -f $FASTA --threads $THREADS -Ob -o $VCF_NORM $VCF_CALLS
35-
$BCFTOOLS index -f $VCF_NORM
33+
VCF_NORM=calls.norm.vcf.gz
34+
$BCFTOOLS norm -f $FASTA --threads $THREADS -Oz -o $VCF_NORM $VCF_CALLS
35+
$BCFTOOLS index -t -f $VCF_NORM
3636
COUNT2=`$BCFTOOLS view -H $VCF_NORM | wc -l`
3737
echo 'Variants remaining: '$COUNT2
3838
echo -e 'VariantsAfterNorm\t'$COUNT2 >> $REPORT
3939

4040
# filter adjacent indels within 5bp
4141
echo 'Filtering indel clusters. Note: this is not currently used in the consensus.'
42-
VCF_INDEL_FILTER=calls.norm.flt-indels.bcf
43-
$BCFTOOLS filter --IndelGap 5 -Ob -o $VCF_INDEL_FILTER $VCF_NORM
44-
$BCFTOOLS index -f $VCF_INDEL_FILTER
42+
VCF_INDEL_FILTER=calls.norm.flt-indels.vcf.gz
43+
$BCFTOOLS filter --IndelGap 5 -Oz -o $VCF_INDEL_FILTER $VCF_NORM
44+
$BCFTOOLS index -t -f $VCF_INDEL_FILTER
4545
COUNT3=`$BCFTOOLS view -H $VCF_INDEL_FILTER | wc -l`
4646
echo 'Variants that would remain: '$COUNT3
4747
echo -e 'VariantsAfterIndelFilter\t'$COUNT3 >> $REPORT
4848

49-
#At the moment, do not user the filtered version:
50-
VCF_FOR_CONSENSUS=$VCF_NORM
49+
#At the moment, do not use the filtered version:
50+
VCF_FOR_CONSENSUS=`basename $BAM .bam`".calls.vcf.gz"
51+
mv $VCF_NORM $VCF_FOR_CONSENSUS
52+
mv ${VCF_NORM}.tbi ${VCF_FOR_CONSENSUS}.tbi
53+
5154
$BCFTOOLS consensus -f $FASTA -m $MASK_BED -o $OUT $VCF_FOR_CONSENSUS
5255
$BCFTOOLS consensus -f $FASTA -m $MASK_BED -o $OUT_IUPAC --iupac-codes $VCF_FOR_CONSENSUS
5356

5457
rm $VCF_CALLS
55-
rm ${VCF_CALLS}.csi
58+
rm ${VCF_CALLS}.tbi
5659

57-
rm $VCF_NORM
58-
rm ${VCF_NORM}.csi
60+
#Previously moved:
61+
#rm $VCF_NORM
62+
#rm ${VCF_NORM}.tbi
5963

6064
rm $VCF_INDEL_FILTER
61-
rm ${VCF_INDEL_FILTER}.csi
65+
rm ${VCF_INDEL_FILTER}.tbi

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/LofreqAnalysis.java

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import htsjdk.samtools.util.CloseableIterator;
66
import htsjdk.samtools.util.IOUtil;
77
import htsjdk.samtools.util.Interval;
8+
import htsjdk.variant.variantcontext.Allele;
89
import htsjdk.variant.variantcontext.VariantContext;
910
import htsjdk.variant.vcf.VCFFileReader;
1011
import org.apache.log4j.Logger;
@@ -37,7 +38,12 @@
3738
import java.util.ArrayList;
3839
import java.util.Arrays;
3940
import java.util.Collections;
41+
import java.util.HashMap;
42+
import java.util.HashSet;
4043
import java.util.List;
44+
import java.util.Map;
45+
import java.util.Set;
46+
import java.util.stream.Collectors;
4147

4248
public class LofreqAnalysis extends AbstractCommandPipelineStep<LofreqAnalysis.LofreqWrapper> implements AnalysisStep
4349
{
@@ -118,6 +124,7 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
118124
int totalVariants = 0;
119125
int totalGT1 = 0;
120126
int totalGT50 = 0;
127+
Map<String, Double> alleleToAF = new HashMap<>();
121128
int totalIndelGT1 = 0;
122129
try (VCFFileReader reader = new VCFFileReader(outputVcfSnpEff);CloseableIterator<VariantContext> it = reader.iterator())
123130
{
@@ -137,6 +144,10 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
137144
if (vc.hasAttribute("AF") && vc.getAttributeAsDouble("AF", 0.0) > 0.5)
138145
{
139146
totalGT50++;
147+
String key = getHashKey(vc);
148+
Double af = alleleToAF.getOrDefault(key, 0.0);
149+
af = Math.max(af, vc.getAttributeAsDouble("AF", 0.0));
150+
alleleToAF.put(key, af);
140151
}
141152
}
142153
}
@@ -228,10 +239,51 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
228239
getPipelineCtx().getLogger().info("Total intervals of these gaps: " + gapIntervals);
229240

230241
consensusWrapper.execute(Arrays.asList("/bin/bash", script.getPath(), inputBam.getPath(), referenceGenome.getWorkingFastaFile().getPath(), mask.getPath()));
242+
File calls = new File(inputBam.getParentFile(), FileUtil.getBaseName(inputBam) + ".calls.vcf.gz");
243+
244+
Set<VariantContext> variantsBcftoolsOnly = new HashSet<>();
245+
try (VCFFileReader reader = new VCFFileReader(calls);CloseableIterator<VariantContext> it = reader.iterator())
246+
{
247+
while (it.hasNext())
248+
{
249+
VariantContext vc = it.next();
250+
String key = getHashKey(vc);
251+
if (alleleToAF.containsKey(key))
252+
{
253+
//Variant shared
254+
alleleToAF.remove(key);
255+
}
256+
else
257+
{
258+
variantsBcftoolsOnly.add(vc);
259+
}
260+
}
261+
}
231262

232263
String description = String.format("Total Variants: %s\nTotal GT 1 PCT: %s\nTotal GT 50 PCT: %s\nTotal Indel GT 1 PCT: %s", totalVariants, totalGT1, totalGT50, totalIndelGT1);
264+
265+
if (!variantsBcftoolsOnly.isEmpty())
266+
{
267+
getPipelineCtx().getLogger().error("The following variants were in bcftools, but not GT50% in lofreq: ");
268+
variantsBcftoolsOnly.forEach(vc -> getPipelineCtx().getLogger().error(getHashKey(vc)));
269+
270+
description += "\n" + "WARNING: " + variantsBcftoolsOnly.size() + " variants detected in bcftools and not lofreq";
271+
}
272+
273+
if (!alleleToAF.isEmpty())
274+
{
275+
getPipelineCtx().getLogger().error("The following variants were GT50% in lofreq, but not in bcftools: ");
276+
alleleToAF.keySet().forEach(vc -> getPipelineCtx().getLogger().error(vc));
277+
278+
description += "\n" + "WARNING: " + alleleToAF.size() + " variants detected in lofreq and not bcftools";
279+
}
280+
281+
233282
output.addSequenceOutput(outputVcfSnpEff, "LoFreq: " + rs.getName(), CATEGORY, rs.getReadsetId(), null, referenceGenome.getGenomeId(), description);
234-
output.addSequenceOutput(coverageOut, "Depth of Coverage: " + rs.getName(), "Depth of Coverage", rs.getReadsetId(), null, referenceGenome.getGenomeId(), description);
283+
output.addSequenceOutput(coverageOut, "Depth of Coverage: " + rs.getName(), "Depth of Coverage", rs.getReadsetId(), null, referenceGenome.getGenomeId(), null);
284+
285+
File consensusFasta = new File(inputBam.getParentFile(), FileUtil.getBaseName(inputBam.getName()) + ".consensus.fasta");
286+
output.addSequenceOutput(consensusFasta, "Consensus: " + rs.getName(), "Viral Consensus Sequence", rs.getReadsetId(), null, referenceGenome.getGenomeId(), description);
235287

236288
return output;
237289
}
@@ -242,6 +294,11 @@ public Output performAnalysisPerSampleLocal(AnalysisModel model, File inputBam,
242294
return null;
243295
}
244296

297+
private String getHashKey(VariantContext vc)
298+
{
299+
return vc.getContig() + "<>" + vc.getStart() + vc.getAlternateAlleles().stream().map(Allele::getBaseString).collect(Collectors.joining(";"));
300+
}
301+
245302
public static class LofreqWrapper extends AbstractCommandWrapper
246303
{
247304
public LofreqWrapper(Logger log)

0 commit comments

Comments
 (0)