1313import htsjdk .variant .variantcontext .writer .VariantContextWriterBuilder ;
1414import htsjdk .variant .vcf .VCFFileReader ;
1515import htsjdk .variant .vcf .VCFHeader ;
16+ import org .apache .commons .io .FileUtils ;
1617import org .apache .commons .lang3 .StringUtils ;
1718import org .apache .commons .lang3 .tuple .Pair ;
1819import org .apache .log4j .Logger ;
2930import org .labkey .api .sequenceanalysis .pipeline .SequenceAnalysisJobSupport ;
3031import org .labkey .api .sequenceanalysis .pipeline .SequenceOutputHandler ;
3132import org .labkey .api .sequenceanalysis .pipeline .ToolParameterDescriptor ;
32- import org .labkey .api .util . FileUtil ;
33+ import org .labkey .api .sequenceanalysis . run . SimpleScriptWrapper ;
3334import org .labkey .api .util .PageFlowUtil ;
3435import org .labkey .api .util .StringUtilsLabKey ;
36+ import org .labkey .api .writer .PrintWriters ;
3537import org .labkey .sequenceanalysis .SequenceAnalysisModule ;
3638import org .labkey .sequenceanalysis .run .variant .SNPEffStep ;
3739import org .labkey .sequenceanalysis .run .variant .SnpEffWrapper ;
4244import java .io .FileOutputStream ;
4345import java .io .IOException ;
4446import java .io .OutputStreamWriter ;
47+ import java .io .PrintWriter ;
4548import java .nio .file .Files ;
4649import java .util .ArrayList ;
4750import java .util .Arrays ;
@@ -232,7 +235,7 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
232235 File indelOutput = new File (ctx .getOutputDir (), basename + "indels.txt.gz" );
233236 try (CSVWriter writer = new CSVWriter (new BufferedWriter (new OutputStreamWriter (new GZIPOutputStream (new FileOutputStream (indelOutput )), StringUtilsLabKey .DEFAULT_CHARSET )), '\t' , CSVWriter .NO_QUOTE_CHARACTER ))
234237 {
235- writer .writeNext (new String []{"ReadsetName" , "OutputFileId" , "ReadsetId" , "Source" , "Contig" , "Start" , "End" , "Ref" , "AltAllele" , "GatkDepth" , "LoFreqDepth" , "AltCount" , "AltAF" });
238+ writer .writeNext (new String []{"ReadsetName" , "OutputFileId" , "ReadsetId" , "Source" , "Contig" , "Start" , "End" , "Length" , " Ref" , "AltAllele" , "GatkDepth" , "LoFreqDepth" , "AltCount" , "AltAF" });
236239
237240 for (SequenceOutputFile so : inputFiles )
238241 {
@@ -282,7 +285,8 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
282285 List <Integer > depths = vc .getAttributeAsIntList ("DP4" , 0 );
283286 int alleleDepth = depths .get (2 ) + depths .get (3 );
284287
285- writer .writeNext (new String []{ctx .getSequenceSupport ().getCachedReadset (so .getReadset ()).getName (), String .valueOf (so .getRowid ()), String .valueOf (so .getReadset ()), "LoFreq" , vc .getContig (), String .valueOf (vc .getStart ()), String .valueOf (vc .getEnd ()), vc .getReference ().getBaseString (), vc .getAlternateAlleles ().get (0 ).getBaseString (), vc .getAttributeAsString ("GATK_DP" , "ND" ), vc .getAttributeAsString ("DP" , "ND" ), String .valueOf (alleleDepth ), vc .getAttributeAsString ("AF" , "ND" )});
288+ int length = vc .getLengthOnReference () - 1 ; //NOTE: indels are left-padded including one ref base
289+ writer .writeNext (new String []{ctx .getSequenceSupport ().getCachedReadset (so .getReadset ()).getName (), String .valueOf (so .getRowid ()), String .valueOf (so .getReadset ()), "LoFreq" , vc .getContig (), String .valueOf (vc .getStart ()), String .valueOf (vc .getEnd ()), String .valueOf (length ), vc .getReference ().getBaseString (), vc .getAlternateAlleles ().get (0 ).getBaseString (), vc .getAttributeAsString ("GATK_DP" , "ND" ), vc .getAttributeAsString ("DP" , "ND" ), String .valueOf (alleleDepth ), vc .getAttributeAsString ("AF" , "ND" )});
286290 }
287291
288292 for (int i = 0 ; i < vc .getLengthOnReference (); i ++)
@@ -320,13 +324,13 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
320324 String [] line ;
321325 while ((line = reader .readNext ()) != null )
322326 {
323- writer .writeNext (new String []{"Type" , "Contig" , "Start" , "End" , "Depth" , "ReadSupport" , "Fraction" });
324327 if (line [0 ].equals ("Type" ))
325328 {
326329 continue ;
327330 }
328331
329- writer .writeNext (new String []{ctx .getSequenceSupport ().getCachedReadset (so .getReadset ()).getName (), String .valueOf (so .getRowid ()), String .valueOf (so .getReadset ()), "Pindel" , line [1 ], line [2 ], line [3 ], "" , line [0 ], line [4 ], "" , line [5 ], line [6 ]});
332+ int length = Integer .parseInt (line [3 ]) - Integer .parseInt (line [2 ]) - 1 ; //NOTE: pindel reports one base upstream+downstream as part of the indel
333+ writer .writeNext (new String []{ctx .getSequenceSupport ().getCachedReadset (so .getReadset ()).getName (), String .valueOf (so .getRowid ()), String .valueOf (so .getReadset ()), "Pindel" , line [1 ], line [2 ], line [3 ], String .valueOf (length ), "" , line [0 ], line [4 ], "" , line [5 ], line [6 ]});
330334 }
331335 }
332336 }
@@ -341,6 +345,8 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
341345 throw new PipelineJobException (e );
342346 }
343347
348+ sortTsvFile (ctx , indelOutput );
349+
344350 if (!errors .isEmpty ())
345351 {
346352 errors .forEach (ctx .getLogger ()::error );
@@ -694,6 +700,41 @@ private int getReadDepth(File vcf, Map<String, Integer> contigToOffset, String c
694700 }
695701 }
696702
703+ private void sortTsvFile (JobContext ctx , File input ) throws PipelineJobException
704+ {
705+ File output = new File (input .getPath () + ".tmp" );
706+ File script = new File (input .getParentFile (), "script.sh" );
707+ try (PrintWriter writer = PrintWriters .getPrintWriter (script ))
708+ {
709+ writer .println ("#!/bin/bash" );
710+ writer .println ("set -x" );
711+ writer .println ("set -e" );
712+ writer .println ("{" );
713+ writer .println ("zcat " + input .getPath () +" | head -n 1;" );
714+ writer .println ("zcat " + input .getPath () +" | tail +2 | sort -k5,5 -k6,6n -k7,7n;" );
715+ writer .println ("} | gzip -c > " + output .getPath () + "\n " );
716+ }
717+ catch (IOException e )
718+ {
719+ throw new PipelineJobException (e );
720+ }
721+
722+ SimpleScriptWrapper wrapper = new SimpleScriptWrapper (ctx .getLogger ());
723+ wrapper .execute (Arrays .asList ("/bin/bash" , "script.sh" ));
724+
725+ input .delete ();
726+ script .delete ();
727+
728+ try
729+ {
730+ FileUtils .moveFile (output , input );
731+ }
732+ catch (IOException e )
733+ {
734+ throw new PipelineJobException (e );
735+ }
736+ }
737+
697738 private Map <File , VCFFileReader > readerMap = new HashMap <>();
698739
699740 private VCFFileReader getReader (File f )
0 commit comments