@@ -108,103 +108,70 @@ private class SiteAndAlleles
108108 {
109109 private final String _contig ;
110110 private final int _start ;
111+ private final Allele _ref ;
111112
112- Map <Allele , List <Allele >> _encounteredAlleles = new HashMap <>();
113-
114- Allele _ref = null ;
113+ Map <String , Map <Allele , Allele >> _encounteredAlleles = new HashMap <>();
115114 List <Allele > _alternates ;
116- Map <Allele , Map <Allele , Allele >> _renamedAlleles ;
117115
118- public SiteAndAlleles (String contig , int start )
116+ public SiteAndAlleles (String contig , int start , Allele ref )
119117 {
120118 _contig = contig ;
121119 _start = start ;
120+ _ref = ref ;
122121 }
123122
124- public void addSite (VariantContext vc )
123+ public Allele getRenamedAllele (VariantContext vc , Allele toCheck )
125124 {
126- List <Allele > alleles = _encounteredAlleles .getOrDefault (vc .getReference (), new ArrayList <>());
127- vc .getAlternateAlleles ().forEach (a -> {
128- if (!alleles .contains (a ))
129- {
130- alleles .add (a );
131- }
132- });
133- _encounteredAlleles .put (vc .getReference (), alleles );
125+ Allele ret = _encounteredAlleles .get (getEncounteredKey (vc .getStart (), vc .getReference ())).get (toCheck );
126+
127+ return ret == null ? toCheck : ret ;
134128 }
135129
136- public void doFinalize ( )
130+ private String getEncounteredKey ( int start , Allele ref )
137131 {
138- if (_ref != null )
139- {
140- throw new IllegalArgumentException ("Has already been finalized!" );
141- }
142-
143- if (_encounteredAlleles .keySet ().size () == 1 )
144- {
145- _ref = _encounteredAlleles .keySet ().iterator ().next ();
146- _alternates = Collections .unmodifiableList (_encounteredAlleles .get (_ref ));
147- _renamedAlleles = Collections .emptyMap ();
148-
149- return ;
150- }
151-
152- Allele finalRef = null ;
153- for (Allele ref : _encounteredAlleles .keySet ())
154- {
155- if (finalRef == null )
156- {
157- finalRef = ref ;
158- }
159- else
160- {
161- finalRef = determineReferenceAllele (finalRef , ref );
162- }
163- }
164- _ref = finalRef ;
165-
166- List <Allele > finalAlleles = new ArrayList <>();
167- _renamedAlleles = new HashMap <>();
132+ return start + ":" + ref .getBaseString ();
133+ }
168134
169- for (Allele ref : _encounteredAlleles .keySet ())
170- {
171- if (ref .equals (finalRef ))
172- {
173- finalAlleles .addAll (_encounteredAlleles .get (finalRef ));
174- }
175- else
135+ public void addSite (VariantContext vc )
136+ {
137+ Map <Allele , Allele > alleles = _encounteredAlleles .getOrDefault (getEncounteredKey (vc .getStart (), vc .getReference ()), new HashMap <>());
138+ vc .getAlternateAlleles ().forEach (a -> {
139+ Allele translated = extractAlleleForPosition (vc , a );
140+ if (translated != null )
176141 {
177- Map <Allele , Allele > alleleMap = createAlleleMapping (_ref , ref , _encounteredAlleles .get (ref ));
178- for (Allele a : _encounteredAlleles .get (ref ))
179- {
180- a = alleleMap .getOrDefault (a , a );
181- if (!finalAlleles .contains (a ))
182- {
183- finalAlleles .add (a );
184- }
185- }
142+ if (!alleles .containsKey (a ))
143+ alleles .put (a , translated );
186144
187- _renamedAlleles .put (ref , alleleMap );
145+ if (!_alternates .contains (translated ))
146+ _alternates .add (translated );
188147 }
189- }
190- _alternates = Collections . unmodifiableList ( finalAlleles );
148+ });
149+ _encounteredAlleles . put ( getEncounteredKey ( vc . getStart (), vc . getReference ()), alleles );
191150 }
192151
193152 public boolean isMergedRef ()
194153 {
195154 return _encounteredAlleles .size () > 1 ;
196155 }
197156
198- public void checkValid ( Logger log )
157+ protected Allele extractAlleleForPosition ( VariantContext vc , Allele a )
199158 {
200- if (_alternates .isEmpty ())
159+ int offset = vc .getStart () - _start ;
160+
161+ //deletion
162+ if (a .length () <= offset )
163+ {
164+ return Allele .SPAN_DEL ;
165+ }
166+ else if (offset == 0 && a .length () == 1 )
167+ {
168+ return a ;
169+ }
170+ else
201171 {
202- log .error ("No alternate alleles found: " + _contig + "/" + _start + "/" + _ref );
203- _encounteredAlleles .forEach ((a , b ) -> {
204- log .error (a .getBaseString () + ": " + b .stream ().map (Allele ::getBaseString ).collect (Collectors .joining ("," )));
205- });
172+ Allele ret = Allele .create (a .getBaseString ().substring (offset , offset + 1 ));
206173
207- throw new IllegalArgumentException ( "No alternate alleles found: " + _contig + "/" + _start + "/" + _ref ) ;
174+ return _ref . equals ( ret , true ) ? null : ret ;
208175 }
209176 }
210177 }
@@ -218,7 +185,7 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
218185 final double minAfThreshold = ctx .getParams ().optDouble (MIN_AF , 0.0 );
219186 final int minDepth = ctx .getParams ().optInt (MIN_COVERAGE , 0 );
220187
221- ctx .getLogger ().info ("Building whitelist of sites and alleles " );
188+ ctx .getLogger ().info ("Pass 1: Building whitelist of sites" );
222189 Map <String , SiteAndAlleles > siteToAllele = new HashMap <>();
223190 List <Pair <String , Integer >> whitelistSites = new ArrayList <>();
224191
@@ -250,15 +217,18 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
250217 double af = vc .getAttributeAsDouble ("AF" , 0.0 );
251218 if (af >= minAfThreshold )
252219 {
253- String key = getCacheKey (vc .getContig (), vc .getStart ());
254- SiteAndAlleles site = siteToAllele .containsKey (key ) ? siteToAllele .get (key ) : new SiteAndAlleles (vc .getContig (), vc .getStart ());
255- site .addSite (vc );
256-
257- if (!siteToAllele .containsKey (key ))
220+ for (int i = 0 ; i < vc .getLengthOnReference ();i ++)
258221 {
259- whitelistSites .add (Pair .of (vc .getContig (), vc .getStart ()));
222+ int effectiveStart = vc .getStart () + i ;
223+ String key = getCacheKey (vc .getContig (), effectiveStart );
224+ Allele ref = vc .getLengthOnReference () == 1 ? vc .getReference () : Allele .create (vc .getReference ().getBaseString ().substring (i , i + 1 ), true );
225+ SiteAndAlleles site = siteToAllele .containsKey (key ) ? siteToAllele .get (key ) : new SiteAndAlleles (vc .getContig (), effectiveStart , ref );
226+ if (!siteToAllele .containsKey (key ))
227+ {
228+ whitelistSites .add (Pair .of (vc .getContig (), effectiveStart ));
229+ }
230+ siteToAllele .put (key , site );
260231 }
261- siteToAllele .put (key , site );
262232 }
263233 }
264234 }
@@ -267,10 +237,31 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
267237 ctx .getLogger ().info ("total sites: " + whitelistSites .size ());
268238 Collections .sort (whitelistSites );
269239
270- siteToAllele .forEach ((x , y ) -> {
271- y .doFinalize ();
272- y .checkValid (ctx .getLogger ());
273- });
240+ ctx .getLogger ().info ("Pass 2: establish alleles per site" );
241+ for (SequenceOutputFile so : inputFiles )
242+ {
243+ try (VCFFileReader reader = new VCFFileReader (so .getFile ()))
244+ {
245+ for (Pair <String , Integer > site : whitelistSites )
246+ {
247+ String key = getCacheKey (site .getLeft (), site .getRight ());
248+
249+ //NOTE: LoFreq should output one VCF line per allele
250+ //NOTE: deletions spanning this site can also be included, with a start position ahead of this.
251+ try (CloseableIterator <VariantContext > it = reader .query (site .getLeft (), site .getRight (), site .getRight ()))
252+ {
253+ VariantContext vc = it .next ();
254+ if (vc .getAttribute ("AF" ) == null )
255+ {
256+ continue ;
257+ }
258+
259+ //NOTE: the start position of this SiteAndAlleles might differ from the VC
260+ siteToAllele .get (key ).addSite (vc );
261+ }
262+ }
263+ }
264+ }
274265
275266 ReferenceGenome genome = ctx .getSequenceSupport ().getCachedGenome (genomeIds .iterator ().next ());
276267 SAMSequenceDictionary dict = SAMSequenceDictionaryExtractor .extractDictionary (genome .getSequenceDictionary ().toPath ());
@@ -342,20 +333,11 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
342333 {
343334 VariantContext vc = it .next ();
344335
345- //This could occur with a deletion from the prior position
346- if (vc .getStart () < siteDef ._start )
347- {
348- ctx .getLogger ().info ("Variant start less than site def, probably indicates an upstream deletion: " + siteDef ._start + " / " + vc .getStart () + " / " + so .getFile ().getPath ());
349- continue ;
350- }
351- else if (vc .getStart () > siteDef ._start )
336+ if (vc .getStart () > siteDef ._start )
352337 {
353338 throw new PipelineJobException ("Unexpected variant start. site: " + siteDef ._start + " / vc: " + vc .getStart () + " / " + so .getFile ().getPath ());
354339 }
355340
356- refs .add (vc .getReference ());
357- Map <Allele , Allele > alleleRenameMap = siteDef ._renamedAlleles .getOrDefault (vc .getReference (), Collections .emptyMap ());
358-
359341 if (vc .getAttribute ("GATK_DP" ) == null )
360342 {
361343 throw new PipelineJobException ("Expected GATK_DP annotation on line " + key + " in file: " + so .getFile ().getPath ());
@@ -370,7 +352,12 @@ else if (vc.getStart() > siteDef._start)
370352 if (depth < minDepth )
371353 {
372354 vc .getAlternateAlleles ().forEach (a -> {
373- alleleToAf .put (a , NO_DATA_VAL );
355+ Allele translatedAllele = siteDef .getRenamedAllele (vc , a );
356+ if (translatedAllele != null )
357+ {
358+ double val = alleleToAf .getOrDefault (a , NO_DATA_VAL );
359+ alleleToAf .put (a , val );
360+ }
374361 });
375362 }
376363 else
@@ -385,9 +372,15 @@ else if (vc.getStart() > siteDef._start)
385372 vc .getAlternateAlleles ().forEach (a -> {
386373 int aIdx = vc .getAlternateAlleles ().indexOf (a );
387374
388- //If the alleles were renamed due to combining references of differing length, account for this:
389- a = alleleRenameMap .getOrDefault (a , a );
390- alleleToAf .put (a , afs .get (aIdx ));
375+ a = siteDef .getRenamedAllele (vc , a );
376+ double val = alleleToAf .getOrDefault (a , 0.0 );
377+ if (val == NO_DATA_VAL )
378+ {
379+ val = 0 ;
380+ }
381+
382+ val = val + afs .get (aIdx );
383+ alleleToAf .put (a , val );
391384 });
392385 }
393386 }
@@ -413,10 +406,10 @@ else if (vc.getStart() > siteDef._start)
413406 refs .forEach (r -> {
414407 if (!siteDef ._encounteredAlleles .containsKey (r ))
415408 {
416- throw new IllegalArgumentException ("Reference not found: " + r .getBaseString () + ", at site: " + siteDef ._start + ", allowable : " + refs .stream ().map (Allele ::getBaseString ).collect (Collectors .joining ("," )));
409+ throw new IllegalArgumentException ("Reference not found: " + r .toString () + ", at site: " + siteDef ._start + ", all refs : " + refs .stream ().map (Allele ::toString ).collect (Collectors .joining ("," )) + ", allowed: " + StringUtils . join ( siteDef . _encounteredAlleles . keySet (), "," ));
417410 }
418411
419- alleleSets .add (siteDef ._encounteredAlleles .get (r ).stream (). map ( Allele :: getBaseString ). collect ( Collectors . joining ( "," ) ));
412+ alleleSets .add (StringUtils . join ( siteDef ._encounteredAlleles .get (r ).keySet (), "," ));
420413 });
421414 toWrite .add (alleleSets .stream ().collect (Collectors .joining (";" )));
422415 }
0 commit comments