6060public class AnnotationStep extends AbstractCommandPipelineStep <CassandraRunner > implements VariantProcessingStep
6161{
6262 public static final String GRCH37 = "genome37" ;
63+ private static final String CLINVAR_VCF = "clinvar37" ;
6364 private static final String DBNSFP_FILE = "dbnsfpFile" ;
6465
6566 public static final String CHAIN_FILE = "CHAIN_FILE" ;
@@ -74,6 +75,10 @@ public static class Provider extends AbstractVariantProcessingStepProvider<Annot
7475 public Provider ()
7576 {
7677 super ("AnnotateVariants" , "Annotate VCF for mGAP" , "VCF Annotation" , "This will annotate an input NHP VCF using human annotations including funcotator and SnpSift. This jobs will automatically look for chain files based on the source VCF genome and GRCh37/38 targets and will fail if these are not found." , Arrays .asList (
78+ ToolParameterDescriptor .createExpDataParam (CLINVAR_VCF , "Clinvar 2.0 VCF (GRCh37)" , "This is the DataId of the VCF containing human Clinvar variants, which should use the GRCh37 genome. After liftover of the rhesus data, any matching variants are annotated." , "ldk-expdatafield" , new JSONObject ()
79+ {{
80+ put ("allowBlank" , false );
81+ }}, null ),
7782 ToolParameterDescriptor .createExpDataParam (DBNSFP_FILE , "dbNSFP Database (GRCh37)" , "This is the DataId of the dbNSFP database (txt.gz file) using the GRCh37 genome." , "ldk-expdatafield" , new JSONObject ()
7883 {{
7984 put ("allowBlank" , false );
@@ -205,6 +210,12 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno
205210 {
206211 VariantProcessingStepOutputImpl output = new VariantProcessingStepOutputImpl ();
207212
213+ File clinvarVCF = getPipelineCtx ().getSequenceSupport ().getCachedData (getProvider ().getParameterByName (CLINVAR_VCF ).extractValue (getPipelineCtx ().getJob (), getProvider (), getStepIdx (), Integer .class ));
214+ if (!clinvarVCF .exists ())
215+ {
216+ throw new PipelineJobException ("Unable to find file: " + clinvarVCF .getPath ());
217+ }
218+
208219 ReferenceGenome grch37Genome = getPipelineCtx ().getSequenceSupport ().getCachedGenome (getProvider ().getParameterByName (GRCH37 ).extractValue (getPipelineCtx ().getJob (), getProvider (), getStepIdx (), Integer .class ));
209220 Integer chainFileId = getPipelineCtx ().getSequenceSupport ().getCachedObject (CHAIN_FILE , Integer .class );
210221 File chainFile = getPipelineCtx ().getSequenceSupport ().getCachedData (chainFileId );
@@ -370,6 +381,38 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno
370381 output .addIntermediateFile (liftedToGRCh37 );
371382 output .addIntermediateFile (new File (liftedToGRCh37 .getPath () + ".tbi" ));
372383
384+ //annotate with clinvar
385+ getPipelineCtx ().getLogger ().info ("annotating with ClinVar 2.0" );
386+ File clinvarAnnotated = new File (outputDirectory , SequenceAnalysisService .get ().getUnzippedBaseName (liftedToGRCh37 .getName ()) + ".cv.vcf.gz" );
387+ if (forceRecreate || !indexExists (clinvarAnnotated ))
388+ {
389+ ClinvarAnnotatorRunner cvRunner = new ClinvarAnnotatorRunner (getPipelineCtx ().getLogger ());
390+ cvRunner .execute (liftedToGRCh37 , clinvarVCF , clinvarAnnotated );
391+ }
392+ else
393+ {
394+ getPipelineCtx ().getLogger ().info ("resuming with existing file: " + clinvarAnnotated .getPath ());
395+ }
396+ output .addOutput (clinvarAnnotated , "VCF Annotated With ClinVar2.0" );
397+ output .addIntermediateFile (clinvarAnnotated );
398+ output .addIntermediateFile (new File (clinvarAnnotated .getPath () + ".tbi" ));
399+
400+ //backport ClinVar
401+ getPipelineCtx ().getLogger ().info ("backport ClinVar 2.0 to source genome" );
402+ File clinvarAnnotatedBackport = new File (outputDirectory , SequenceAnalysisService .get ().getUnzippedBaseName (clinvarAnnotated .getName ()) + ".bp.vcf.gz" );
403+ if (forceRecreate || !indexExists (clinvarAnnotatedBackport ))
404+ {
405+ BackportLiftedVcfRunner bpRunner = new BackportLiftedVcfRunner (getPipelineCtx ().getLogger ());
406+ bpRunner .execute (clinvarAnnotated , originalGenome .getWorkingFastaFile (), grch37Genome .getWorkingFastaFile (), clinvarAnnotatedBackport );
407+ }
408+ else
409+ {
410+ getPipelineCtx ().getLogger ().info ("resuming with existing file: " + clinvarAnnotatedBackport .getPath ());
411+ }
412+ output .addOutput (clinvarAnnotatedBackport , "VCF Annotated With Clinvar, Backported" );
413+ output .addIntermediateFile (clinvarAnnotatedBackport );
414+ output .addIntermediateFile (new File (clinvarAnnotatedBackport .getPath () + ".tbi" ));
415+
373416 //annotate with SnpSift
374417 getPipelineCtx ().getLogger ().info ("annotating with SnpSift/dbnsfp" );
375418 File snpSiftAnnotated = new File (outputDirectory , SequenceAnalysisService .get ().getUnzippedBaseName (liftedToGRCh37 .getName ()) + ".snpSift.vcf.gz" );
@@ -543,7 +586,7 @@ public Output processVariants(File inputVCF, File outputDirectory, ReferenceGeno
543586 addToolFieldNames ("SnpSift" , "-ssf" , options , multiAnnotated .getParentFile (), output , liftFields , SOURCE_FIELDS );
544587 addToolFieldNames ("SnpSift" , "-rssf" , options , multiAnnotated .getParentFile (), output , liftFields , TARGET_FIELDS );
545588
546- maRunner .execute (inputVCF , cassandraAnnotatedBackport , liftoverRejects , funcotatorAnnotatedBackport , snpSiftAnnotatedBackport , multiAnnotated , options );
589+ maRunner .execute (inputVCF , cassandraAnnotatedBackport , clinvarAnnotatedBackport , liftoverRejects , funcotatorAnnotatedBackport , snpSiftAnnotatedBackport , multiAnnotated , options );
547590 }
548591 else
549592 {
0 commit comments