Commit 284d9ef

Enhance interpretation of pindel results
1 parent c2fe35c commit 284d9ef

3 files changed: +155 lines, -23 lines

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/LofreqAnalysis.java

Lines changed: 50 additions & 3 deletions
@@ -613,6 +613,11 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
         File pindelOutput = PindelAnalysis.runPindel(output, getPipelineCtx(), rs, outputDir, inputBam, referenceGenome.getWorkingFastaFile(), minFraction, minDepth, true, coverageOut, minInsertSize);
         try (CSVReader reader = new CSVReader(Readers.getReader(pindelOutput), '\t'))
         {
+            final int MAX_DEL_EVENT_COVERAGE = 20;
+            final double MIN_AF = 0.25;
+            final int MIN_LENGTH_TO_CONSIDER = 10;
+            final int MAX_DELETION_LENGTH = 5000;
+
             String[] line;
             while ((line = reader.readNext()) != null)
             {
@@ -621,10 +626,50 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
                     continue;
                 }
 
-                if (Double.parseDouble(line[6]) >= 0.35)
+                int start = Integer.parseInt(line[2]); //1-based, coordinate prior, like VCF
+                int end = Integer.parseInt(line[3]); //1-based, actual coordinate, like VCF
+                String refAllele = line[11];
+                String altAllele = line[12];
+                int refLength = end - start;
+                int altLength = altAllele.length();
+
+                // Assume LoFreq calls these well enough:
+                if (refLength < MIN_LENGTH_TO_CONSIDER && altLength < MIN_LENGTH_TO_CONSIDER)
+                {
+                    continue;
+                }
+
+                if ("D".equals(line[0]) && refLength > MAX_DELETION_LENGTH)
+                {
+                    continue;
+                }
+
+                if (Double.parseDouble(line[6]) < MIN_AF)
                 {
-                    indelMap.put(line[0], indelMap.getOrDefault(line[0], 0) + 1);
+                    continue;
+                }
+
+                double eventCoverage = 0.0;
+                if (StringUtils.trimToNull(line[10]) != null)
+                {
+                    eventCoverage = Double.parseDouble(line[10]);
+                }
+
+                if ("D".equals(line[0]) && eventCoverage > MAX_DEL_EVENT_COVERAGE)
+                {
+                    continue;
                 }
+
+                indelMap.put(line[0], indelMap.getOrDefault(line[0], 0) + 1);
+
+                VariantContextBuilder vcb = new VariantContextBuilder();
+                vcb.start(start);
+                vcb.stop(end);
+                vcb.chr(line[1]);
+                vcb.alleles(Arrays.asList(Allele.create(refAllele, true), Allele.create(altAllele)));
+                vcb.attribute("AF", Double.parseDouble(line[6]));
+                int dp = "I".equals(line[0]) ? Integer.parseInt(line[4]) : (int)Double.parseDouble(line[10]);
+                vcb.attribute("DP", dp);
             }
         }
         catch (IOException e)
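
Note: the hunk above constructs a VariantContextBuilder, but the visible diff ends before the builder is consumed. For orientation only, a minimal htsjdk sketch of how such a builder is typically finalized and written; the output file name, header contents, and writer options here are assumptions, not part of this commit:

import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.writer.Options;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder;
import htsjdk.variant.vcf.VCFHeader;

// Hypothetical continuation: turn the builder into an immutable VariantContext
// and append it to a VCF. A real header would also declare AF and DP INFO fields.
VariantContext vc = vcb.make();
try (VariantContextWriter vcfWriter = new VariantContextWriterBuilder()
        .setOutputFile(new File(outputDir, "pindel_indels.vcf")) // illustrative name
        .unsetOption(Options.INDEX_ON_THE_FLY)
        .build())
{
    vcfWriter.writeHeader(new VCFHeader());
    vcfWriter.add(vc);
}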
@@ -670,7 +715,9 @@ public Output performAnalysisPerSampleRemote(Readset rs, File inputBam, Referenc
             if (pangolinData != null)
             {
                 writer.writeNext(new String[]{"Pangolin", "PangolinLineage", pangolinData[1]});
-                writer.writeNext(new String[]{"Pangolin", "PangolinLineageConfidence", pangolinData[2]});
+                writer.writeNext(new String[]{"Pangolin", "PangolinConflicts", pangolinData[2]});
+                writer.writeNext(new String[]{"Pangolin", "PangolinVersions", pangolinData[3]});
+                writer.writeNext(new String[]{"Pangolin", "PangolinVersions", pangolinData[4]});
             }
             else
             {

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/PangolinHandler.java

Lines changed: 50 additions & 3 deletions
@@ -104,6 +104,10 @@ public void processFilesOnWebserver(PipelineJob job, SequenceAnalysisJobSupport
                 row1.put("metricName", "PangolinLineage");
                 row1.put("qualvalue", line[1]);
                 row1.put("container", so.getContainer());
+                if (StringUtils.trimToNull(line[3]) != null)
+                {
+                    row1.put("comment", line[3]);
+                }
                 toInsert.add(row1);
 
                 if (StringUtils.trimToNull(line[2]) != null)
@@ -113,11 +117,32 @@ public void processFilesOnWebserver(PipelineJob job, SequenceAnalysisJobSupport
                     row2.put("readset", so.getReadset());
                     row2.put("analysis_id", so.getAnalysis_id());
                     row2.put("category", "Pangolin");
-                    row2.put("metricName", "PangolinLineageConfidence");
+                    row2.put("metricName", "PangolinConflicts");
                     row2.put("value", Double.parseDouble(line[2]));
                     row2.put("container", so.getContainer());
+                    if (StringUtils.trimToNull(line[3]) != null)
+                    {
+                        row2.put("comment", line[3]);
+                    }
                     toInsert.add(row2);
                 }
+
+                if (StringUtils.trimToNull(line[4]) != null)
+                {
+                    Map<String, Object> row = new CaseInsensitiveHashMap<>();
+                    row.put("dataid", so.getDataId());
+                    row.put("readset", so.getReadset());
+                    row.put("analysis_id", so.getAnalysis_id());
+                    row.put("category", "Pangolin");
+                    row.put("metricName", "PangolinSummary");
+                    row.put("qualvalue", line[4]);
+                    row.put("container", so.getContainer());
+                    if (StringUtils.trimToNull(line[3]) != null)
+                    {
+                        row.put("comment", line[3]);
+                    }
+                    toInsert.add(row);
+                }
             }
         }
         catch (IOException e)
@@ -169,7 +194,29 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
             for (SequenceOutputFile so : inputFiles)
             {
                 String[] pangolinData = runPangolin(so.getFile(), ctx.getLogger(), ctx.getFileManager());
-                writer.writeNext(new String[]{String.valueOf(so.getRowid()), (pangolinData == null ? "QC Fail" : pangolinData[1]), (pangolinData == null ? "" : pangolinData[2])});
+
+                List<String> versions = new ArrayList<>();
+                if (pangolinData != null)
+                {
+                    if (StringUtils.trimToNull(pangolinData[3]) != null)
+                    {
+                        versions.add("Pangolin version: " + pangolinData[3]);
+                    }
+
+                    if (StringUtils.trimToNull(pangolinData[4]) != null)
+                    {
+                        versions.add("pangoLEARN version: " + pangolinData[4]);
+                    }
+
+                    if (StringUtils.trimToNull(pangolinData[5]) != null)
+                    {
+                        versions.add("pango version: " + pangolinData[5]);
+                    }
+                }
+
+                String comment = StringUtils.join(versions, ",");
+
+                writer.writeNext(new String[]{String.valueOf(so.getRowid()), (pangolinData == null ? "QC Fail" : pangolinData[1]), (pangolinData == null ? "" : pangolinData[2]), comment, (pangolinData == null ? "" : pangolinData[7])});
             }
         }
         catch (IOException e)
@@ -196,7 +243,7 @@ public static void updatePangolinRefs(Logger log) throws PipelineJobException
 
     public static File getRenamedPangolinOutput(File consensusFasta)
     {
-        return new File(consensusFasta.getParentFile(), FileUtil.getBaseName(consensusFasta) + ".pandolin.csv");
+        return new File(consensusFasta.getParentFile(), FileUtil.getBaseName(consensusFasta) + ".pangolin.csv");
     }
 
     public static String[] runPangolin(File consensusFasta, Logger log, PipelineOutputTracker tracker) throws PipelineJobException
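
processFilesRemote now writes five columns per output file, which processFilesOnWebserver reads back by index (line[1] lineage, line[2] conflicts, line[3] comment, line[4] summary). A minimal sketch of consuming that intermediate file, assuming the same tab-delimited CSVReader conventions used elsewhere in this commit; the `tsv` handle and variable names are illustrative, while the column order comes directly from the writeNext call above:

// Columns, per the writeNext call above: rowid, lineage, conflicts, comment, summary
try (CSVReader reader = new CSVReader(Readers.getReader(tsv), '\t'))
{
    String[] line;
    while ((line = reader.readNext()) != null)
    {
        int rowId = Integer.parseInt(line[0]);   // SequenceOutputFile rowid
        String lineage = line[1];                // "QC Fail" when pangolin returned nothing
        String conflicts = line[2];              // empty when pangolin failed
        String comment = line[3];                // joined "Pangolin/pangoLEARN/pango version" strings
        String summary = line[4];                // pangolinData[7]; empty on failure
    }
}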

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/PindelAnalysis.java

Lines changed: 55 additions & 17 deletions
@@ -5,6 +5,8 @@
 import htsjdk.samtools.SAMSequenceRecord;
 import htsjdk.samtools.SamPairUtil;
 import htsjdk.samtools.metrics.MetricsFile;
+import htsjdk.samtools.reference.IndexedFastaSequenceFile;
+import htsjdk.samtools.reference.ReferenceSequence;
 import htsjdk.variant.utils.SAMSequenceDictionaryExtractor;
 import org.json.JSONObject;
 import org.labkey.api.pipeline.PipelineJobException;
@@ -36,7 +38,9 @@
 import java.nio.file.Files;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.stream.Stream;
 
 public class PindelAnalysis extends AbstractPipelineStep implements AnalysisStep
@@ -230,7 +234,7 @@ public static File runPindel(AnalysisOutputImpl output, PipelineContext ctx, Rea
         File outTsv = new File(outDir, FileUtil.getBaseName(inputBam) + ".pindel.txt");
         try (CSVWriter writer = new CSVWriter(PrintWriters.getPrintWriter(outTsv), '\t', CSVWriter.NO_QUOTE_CHARACTER))
         {
-            writer.writeNext(new String[]{"Type", "Contig", "Start", "End", "Depth", "ReadSupport", "Fraction", "Alt", "MeanFlankingCoverage", "LeadingCoverage", "TrailingCoverage", "EventCoverage"});
+            writer.writeNext(new String[]{"Type", "Contig", "Start", "End", "Depth", "ReadSupport", "Fraction", "MeanFlankingCoverage", "LeadingCoverage", "TrailingCoverage", "EventCoverage", "Ref", "Alt", "PindelAllele"});
             parsePindelOutput(ctx, writer, new File(outPrefix.getPath() + "_D"), minFraction, minDepth, gatkDepth, fasta);
             parsePindelOutput(ctx, writer, new File(outPrefix.getPath() + "_SI"), minFraction, minDepth, gatkDepth, fasta);
             parsePindelOutput(ctx, writer, new File(outPrefix.getPath() + "_LI"), minFraction, minDepth, gatkDepth, fasta);
@@ -256,8 +260,11 @@ private static void parsePindelOutput(PipelineContext ctx, CSVWriter writer, Fil
 
         int totalPassing = 0;
         int totalFiltered = 0;
-        try (BufferedReader reader = Readers.getReader(pindelFile))
+        Map<String, ReferenceSequence> contigMap = new HashMap<>();
+        try (BufferedReader reader = Readers.getReader(pindelFile); IndexedFastaSequenceFile iff = new IndexedFastaSequenceFile(fasta))
         {
+            final int WINDOW_SIZE = 50;
+
             String line;
             while ((line = reader.readLine()) != null)
             {
@@ -272,10 +279,12 @@ private static void parsePindelOutput(PipelineContext ctx, CSVWriter writer, Fil
                 }
 
                 String contig = tokens[3].split(" ")[1];
-                int start = Integer.parseInt(tokens[4].split(" ")[1]);
+
+                // Example 26154-26158 (3 bp, reporting padded borders)
+                int basePriorToStart = Integer.parseInt(tokens[4].split(" ")[1]);
 
                 // Capture depth before/after event:
-                int depth = getGatkDepth(ctx, gatkDepthFile, contig, start);
+                int depth = getGatkDepth(ctx, gatkDepthFile, contig, basePriorToStart);
                 if (depth == 0)
                 {
                     totalFiltered++;
@@ -284,8 +293,8 @@ private static void parsePindelOutput(PipelineContext ctx, CSVWriter writer, Fil
 
                 int i = 0;
                 double leadingCoverage = 0.0;
-                while (i < 20) {
-                    int pos = start - i;
+                while (i < WINDOW_SIZE) {
+                    int pos = basePriorToStart - i;
                     if (pos < 1)
                     {
                         break;
@@ -297,8 +306,9 @@ private static void parsePindelOutput(PipelineContext ctx, CSVWriter writer, Fil
 
                 leadingCoverage = leadingCoverage / i;
 
-                String alt = tokens[2].split(" ")[2];
-                alt = alt.replaceAll("\"", "");
+                //NOTE: this is the indel region itself, no flanking. So for a deletion with REF/ALT of ATTC / A--C, it reports TT; for an insertion of ATT / AGTT, it reports G
+                String pindelAllele = tokens[2].split(" ")[2];
+                pindelAllele = pindelAllele.replaceAll("\"", "");
 
                 File dict = new File(fasta.getPath().replace("fasta", "dict"));
                 if (!dict.exists())
@@ -309,12 +319,16 @@ private static void parsePindelOutput(PipelineContext ctx, CSVWriter writer, Fil
                 SAMSequenceDictionary extractor = SAMSequenceDictionaryExtractor.extractDictionary(dict.toPath());
                 SAMSequenceRecord rec = extractor.getSequence(contig);
 
-                int end = Integer.parseInt(tokens[5]);
+                String type = tokens[1].split(" ")[0];
+                int baseAfterEnd = Integer.parseInt(tokens[5]);
+                int trueEnd = "I".equals(type) ? baseAfterEnd : baseAfterEnd - 1;
+
+                // Capture depth before/after event:
                 int j = 0;
                 double trailingCoverage = 0.0;
-                while (j < 20)
+                while (j < WINDOW_SIZE)
                 {
-                    int pos = end + j;
+                    int pos = baseAfterEnd + j;
                     if (pos > rec.getSequenceLength())
                     {
                         break;
@@ -326,26 +340,50 @@ private static void parsePindelOutput(PipelineContext ctx, CSVWriter writer, Fil
 
                 trailingCoverage = trailingCoverage / j;
 
-                String type = tokens[1].split(" ")[0];
                 Double eventCoverage = null;
-                if ("D".equals(type))
+                if ("D".equals(type) || "INV".equals(type))
                 {
                     eventCoverage = 0.0;
-                    int pos = start;
-                    while (pos < end)
+                    int pos = basePriorToStart;
+                    while (pos < baseAfterEnd)
                     {
                         pos++;
                         eventCoverage += getGatkDepth(ctx, gatkDepthFile, contig, pos);
                     }
 
-                    eventCoverage = eventCoverage / (end - start - 1);
+                    eventCoverage = eventCoverage / (baseAfterEnd - basePriorToStart - 1);
                 }
 
                 double meanCoverage = (leadingCoverage + trailingCoverage) / 2.0;
                 double pct = (double)support / meanCoverage;
                 if (pct >= minFraction)
                 {
-                    writer.writeNext(new String[]{type, contig, String.valueOf(start), String.valueOf(end), String.valueOf(depth), String.valueOf(support), String.valueOf(pct), alt, String.valueOf(meanCoverage), String.valueOf(leadingCoverage), String.valueOf(trailingCoverage), (eventCoverage == null ? "" : String.valueOf(eventCoverage))});
+                    if (!contigMap.containsKey(contig))
+                    {
+                        contigMap.put(contig, iff.getSequence(contig));
+                    }
+
+                    ReferenceSequence sequence = contigMap.get(contig);
+                    String alt = "";
+                    String ref = "";
+                    if ("I".equals(type))
+                    {
+                        ref = sequence.getBaseString().substring(basePriorToStart - 1, basePriorToStart);
+                        alt = ref + pindelAllele;
+                    }
+                    else if ("D".equals(type))
+                    {
+                        ref = sequence.getBaseString().substring(basePriorToStart - 1, trueEnd);
+                        alt = sequence.getBaseString().substring(basePriorToStart - 1, basePriorToStart);
+
+                        String predictedPindelAllele = ref + pindelAllele;
+                        if (!predictedPindelAllele.equals(ref))
+                        {
+                            throw new IllegalArgumentException("Unexpected pindel allele: " + ref + " / " + predictedPindelAllele);
+                        }
+                    }
+
+                    writer.writeNext(new String[]{type, contig, String.valueOf(basePriorToStart), String.valueOf(trueEnd), String.valueOf(depth), String.valueOf(support), String.valueOf(pct), String.valueOf(meanCoverage), String.valueOf(leadingCoverage), String.valueOf(trailingCoverage), (eventCoverage == null ? "" : String.valueOf(eventCoverage)), ref, alt, pindelAllele});
                     totalPassing++;
                 }
                 else
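
To make the allele handling concrete, here is a worked deletion example with hypothetical values. Note that the predictedPindelAllele guard (ref + pindelAllele compared against ref) passes only when the PindelAllele column is empty, so in effect it rejects deletions that carry extra non-template sequence; the assumption that pindel's reported allele is empty for a pure deletion follows from that guard, not from pindel documentation:

// Hypothetical deletion of TT from reference context GATTCA (1-based coordinates):
String contigSeq = "GATTCA";
int basePriorToStart = 2;  // the anchor base 'A' before the deleted bases
int trueEnd = 4;           // the last deleted base (the second 'T')

String ref = contigSeq.substring(basePriorToStart - 1, trueEnd);          // "ATT"
String alt = contigSeq.substring(basePriorToStart - 1, basePriorToStart); // "A"
String pindelAllele = "";  // assumed empty for a pure deletion

// ref + pindelAllele equals ref, so this event passes the guard; a deletion
// with non-template inserted bases (pindelAllele != "") would throw instead.
assert (ref + pindelAllele).equals(ref);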
