55import htsjdk .samtools .SAMSequenceRecord ;
66import htsjdk .samtools .SamPairUtil ;
77import htsjdk .samtools .metrics .MetricsFile ;
8+ import htsjdk .samtools .reference .IndexedFastaSequenceFile ;
9+ import htsjdk .samtools .reference .ReferenceSequence ;
810import htsjdk .variant .utils .SAMSequenceDictionaryExtractor ;
911import org .json .JSONObject ;
1012import org .labkey .api .pipeline .PipelineJobException ;
3638import java .nio .file .Files ;
3739import java .util .ArrayList ;
3840import java .util .Arrays ;
41+ import java .util .HashMap ;
3942import java .util .List ;
43+ import java .util .Map ;
4044import java .util .stream .Stream ;
4145
4246public class PindelAnalysis extends AbstractPipelineStep implements AnalysisStep
@@ -230,7 +234,7 @@ public static File runPindel(AnalysisOutputImpl output, PipelineContext ctx, Rea
230234 File outTsv = new File (outDir , FileUtil .getBaseName (inputBam ) + ".pindel.txt" );
231235 try (CSVWriter writer = new CSVWriter (PrintWriters .getPrintWriter (outTsv ), '\t' , CSVWriter .NO_QUOTE_CHARACTER ))
232236 {
233- writer .writeNext (new String []{"Type" , "Contig" , "Start" , "End" , "Depth" , "ReadSupport" , "Fraction" , "Alt" , " MeanFlankingCoverage" , "LeadingCoverage" , "TrailingCoverage" , "EventCoverage" });
237+ writer .writeNext (new String []{"Type" , "Contig" , "Start" , "End" , "Depth" , "ReadSupport" , "Fraction" , "MeanFlankingCoverage" , "LeadingCoverage" , "TrailingCoverage" , "EventCoverage" , "Ref" , "Alt" , "PindelAllele " });
234238 parsePindelOutput (ctx , writer , new File (outPrefix .getPath () + "_D" ), minFraction , minDepth , gatkDepth , fasta );
235239 parsePindelOutput (ctx , writer , new File (outPrefix .getPath () + "_SI" ), minFraction , minDepth , gatkDepth , fasta );
236240 parsePindelOutput (ctx , writer , new File (outPrefix .getPath () + "_LI" ), minFraction , minDepth , gatkDepth , fasta );
@@ -256,8 +260,11 @@ private static void parsePindelOutput(PipelineContext ctx, CSVWriter writer, Fil
256260
257261 int totalPassing = 0 ;
258262 int totalFiltered = 0 ;
259- try (BufferedReader reader = Readers .getReader (pindelFile ))
263+ Map <String , ReferenceSequence > contigMap = new HashMap <>();
264+ try (BufferedReader reader = Readers .getReader (pindelFile );IndexedFastaSequenceFile iff = new IndexedFastaSequenceFile (fasta ))
260265 {
266+ final int WINDOW_SIZE = 50 ;
267+
261268 String line ;
262269 while ((line = reader .readLine ()) != null )
263270 {
@@ -272,10 +279,12 @@ private static void parsePindelOutput(PipelineContext ctx, CSVWriter writer, Fil
272279 }
273280
274281 String contig = tokens [3 ].split (" " )[1 ];
275- int start = Integer .parseInt (tokens [4 ].split (" " )[1 ]);
282+
283+ // Example 26154-26158 (3 bp, reporting padded borders)
284+ int basePriorToStart = Integer .parseInt (tokens [4 ].split (" " )[1 ]);
276285
277286 // Capture depth before/after event:
278- int depth = getGatkDepth (ctx , gatkDepthFile , contig , start );
287+ int depth = getGatkDepth (ctx , gatkDepthFile , contig , basePriorToStart );
279288 if (depth == 0 )
280289 {
281290 totalFiltered ++;
@@ -284,8 +293,8 @@ private static void parsePindelOutput(PipelineContext ctx, CSVWriter writer, Fil
284293
285294 int i = 0 ;
286295 double leadingCoverage = 0.0 ;
287- while (i < 20 ) {
288- int pos = start - i ;
296+ while (i < WINDOW_SIZE ) {
297+ int pos = basePriorToStart - i ;
289298 if (pos < 1 )
290299 {
291300 break ;
@@ -297,8 +306,9 @@ private static void parsePindelOutput(PipelineContext ctx, CSVWriter writer, Fil
297306
298307 leadingCoverage = leadingCoverage / i ;
299308
300- String alt = tokens [2 ].split (" " )[2 ];
301- alt = alt .replaceAll ("\" " , "" );
309+ //NOTE: this is the indel region itself, no flanking. so for a deletion with REF/ALT of ATTC / A--C, it reports TT. for an insertion of ATT / AGTT, it reports G
310+ String pindelAllele = tokens [2 ].split (" " )[2 ];
311+ pindelAllele = pindelAllele .replaceAll ("\" " , "" );
302312
303313 File dict = new File (fasta .getPath ().replace ("fasta" , "dict" ));
304314 if (!dict .exists ())
@@ -309,12 +319,16 @@ private static void parsePindelOutput(PipelineContext ctx, CSVWriter writer, Fil
309319 SAMSequenceDictionary extractor = SAMSequenceDictionaryExtractor .extractDictionary (dict .toPath ());
310320 SAMSequenceRecord rec = extractor .getSequence (contig );
311321
312- int end = Integer .parseInt (tokens [5 ]);
322+ String type = tokens [1 ].split (" " )[0 ];
323+ int baseAfterEnd = Integer .parseInt (tokens [5 ]);
324+ int trueEnd = "I" .equals (type ) ? baseAfterEnd : baseAfterEnd - 1 ;
325+
326+ // Capture depth before/after event:
313327 int j = 0 ;
314328 double trailingCoverage = 0.0 ;
315- while (j < 20 )
329+ while (j < WINDOW_SIZE )
316330 {
317- int pos = end + j ;
331+ int pos = baseAfterEnd + j ;
318332 if (pos > rec .getSequenceLength ())
319333 {
320334 break ;
@@ -326,26 +340,50 @@ private static void parsePindelOutput(PipelineContext ctx, CSVWriter writer, Fil
326340
327341 trailingCoverage = trailingCoverage / j ;
328342
329- String type = tokens [1 ].split (" " )[0 ];
330343 Double eventCoverage = null ;
331- if ("D" .equals (type ))
344+ if ("D" .equals (type ) || "INV" . equals ( type ) )
332345 {
333346 eventCoverage = 0.0 ;
334- int pos = start ;
335- while (pos < end )
347+ int pos = basePriorToStart ;
348+ while (pos < baseAfterEnd )
336349 {
337350 pos ++;
338351 eventCoverage += getGatkDepth (ctx , gatkDepthFile , contig , pos );
339352 }
340353
341- eventCoverage = eventCoverage / (end - start - 1 );
354+ eventCoverage = eventCoverage / (baseAfterEnd - basePriorToStart - 1 );
342355 }
343356
344357 double meanCoverage = (leadingCoverage + trailingCoverage ) / 2.0 ;
345358 double pct = (double )support / meanCoverage ;
346359 if (pct >= minFraction )
347360 {
348- writer .writeNext (new String []{type , contig , String .valueOf (start ), String .valueOf (end ), String .valueOf (depth ), String .valueOf (support ), String .valueOf (pct ), alt , String .valueOf (meanCoverage ), String .valueOf (leadingCoverage ), String .valueOf (trailingCoverage ), (eventCoverage == null ? "" : String .valueOf (eventCoverage ))});
361+ if (!contigMap .containsKey (contig ))
362+ {
363+ contigMap .put (contig , iff .getSequence (contig ));
364+ }
365+
366+ ReferenceSequence sequence = contigMap .get (contig );
367+ String alt = "" ;
368+ String ref = "" ;
369+ if ("I" .equals (type ))
370+ {
371+ ref = sequence .getBaseString ().substring (basePriorToStart -1 , basePriorToStart );
372+ alt = ref + pindelAllele ;
373+ }
374+ else if ("D" .equals (type ))
375+ {
376+ ref = sequence .getBaseString ().substring (basePriorToStart -1 , trueEnd );
377+ alt = sequence .getBaseString ().substring (basePriorToStart -1 , basePriorToStart );
378+
379+ String predictedPindelAllele = ref + pindelAllele ;
380+ if (!predictedPindelAllele .equals (ref ))
381+ {
382+ throw new IllegalArgumentException ("Unexpected pindel allele: " + ref + " / " + predictedPindelAllele );
383+ }
384+ }
385+
386+ writer .writeNext (new String []{type , contig , String .valueOf (basePriorToStart ), String .valueOf (trueEnd ), String .valueOf (depth ), String .valueOf (support ), String .valueOf (pct ), String .valueOf (meanCoverage ), String .valueOf (leadingCoverage ), String .valueOf (trailingCoverage ), (eventCoverage == null ? "" : String .valueOf (eventCoverage )), ref , alt , pindelAllele });
349387 totalPassing ++;
350388 }
351389 else
0 commit comments