22
33import au .com .bytecode .opencsv .CSVReader ;
44import au .com .bytecode .opencsv .CSVWriter ;
5+ import com .google .common .io .Files ;
56import htsjdk .samtools .util .CloseableIterator ;
67import htsjdk .samtools .util .IOUtil ;
78import htsjdk .variant .variantcontext .Allele ;
@@ -1024,6 +1025,15 @@ private File getVariantTableName(JobContext ctx, File vcfInput)
10241025
10251026 private void inspectAndSummarizeVcf (JobContext ctx , File vcfInput , GeneToNameTranslator translator , ReferenceGenome genome , boolean generateSummaries ) throws PipelineJobException
10261027 {
1028+ File doneFile = new File (ctx .getWorkingDirectory (), "vcfInspect.done" );
1029+ ctx .getFileManager ().addIntermediateFile (doneFile );
1030+
1031+ if (doneFile .exists ())
1032+ {
1033+ ctx .getLogger ().info ("VCF inspection already done, skipping" );
1034+ return ;
1035+ }
1036+
10271037 long sitesInspected = 0L ;
10281038 long totalVariants = 0L ;
10291039 long totalPrivateVariants = 0L ;
@@ -1041,6 +1051,7 @@ private void inspectAndSummarizeVcf(JobContext ctx, File vcfInput, GeneToNameTra
10411051
10421052 if (sitesInspected % 1000000 == 0 )
10431053 {
1054+ ctx .getJob ().setStatus (PipelineJob .TaskStatus .running , "Inspected " + sitesInspected + " variants" );
10441055 ctx .getLogger ().info ("inspected " + sitesInspected + " variants" );
10451056 }
10461057
@@ -1182,11 +1193,12 @@ private void inspectAndSummarizeVcf(JobContext ctx, File vcfInput, GeneToNameTra
11821193 if (vc .getAttribute ("CLN_SIG" ) != null )
11831194 {
11841195 List <String > clnSigs = vc .getAttributeAsStringList ("CLN_SIG" , "" );
1185- if (clnSigs .size () != vc .getAlternateAlleles ().size ())
1196+ if (clnSigs .size () != vc .getAlleles ().size ())
11861197 {
1187- throw new IllegalStateException ("CLN_SIG and alt alleles were not the same length: " + vc .toStringWithoutGenotypes ());
1198+ throw new IllegalStateException ("CLN_SIG and alleles were not the same length: " + vc .toStringWithoutGenotypes ());
11881199 }
11891200
1201+ // NOTE: we iterate REF + ALT here:
11901202 List <String > clnDisease = vc .getAttributeAsStringList ("CLN_DN" , "" );
11911203 List <String > clnAlleleIds = vc .getAttributeAsStringList ("CLN_ALLELEID" , "" );
11921204 int i = -1 ;
@@ -1198,7 +1210,7 @@ private void inspectAndSummarizeVcf(JobContext ctx, File vcfInput, GeneToNameTra
11981210 continue ;
11991211 }
12001212
1201- Allele altAllele = vc .getAlternateAllele (i );
1213+ Allele a = vc .getAlleles (). get (i );
12021214
12031215 String [] sigSplit = sigList .split ("\\ |" );
12041216 List <String > diseaseSplit = Arrays .asList (clnDisease .get (i ).split ("\\ |" ));
@@ -1214,7 +1226,7 @@ private void inspectAndSummarizeVcf(JobContext ctx, File vcfInput, GeneToNameTra
12141226
12151227 try
12161228 {
1217- maybeWriteVariantLine (queuedLines , vc , altAllele .getBaseString (), "ClinVar" , diseaseSplit .get (j ), description , overlappingGenes , omimIds , omimPhenotypes , ctx .getLogger (), "ClinVar:" + clnAlleleIds .get (i ));
1229+ maybeWriteVariantLine (queuedLines , vc , a .getBaseString (), "ClinVar" , diseaseSplit .get (j ), description , overlappingGenes , omimIds , omimPhenotypes , ctx .getLogger (), "ClinVar:" + clnAlleleIds .get (i ));
12181230
12191231 }
12201232 catch (IndexOutOfBoundsException e )
@@ -1300,6 +1312,15 @@ private void inspectAndSummarizeVcf(JobContext ctx, File vcfInput, GeneToNameTra
13001312
13011313 generateSummaries (ctx , vcfInput , genome , totalVariants , totalPrivateVariants , totalSubjects , typeCounts );
13021314 }
1315+
1316+ try
1317+ {
1318+ Files .touch (doneFile );
1319+ }
1320+ catch (IOException e )
1321+ {
1322+ throw new PipelineJobException (e );
1323+ }
13031324 }
13041325
13051326 public Collection <String > parseRawOmimPheno (VariantContext vc , Logger log )
0 commit comments