Skip to content

Commit 166c3c9

Browse files
committed
When merging VCFs, split multi-site alleles
1 parent e49c9b3 commit 166c3c9

File tree

1 file changed

+92
-99
lines changed

1 file changed

+92
-99
lines changed

SequenceAnalysis/src/org/labkey/sequenceanalysis/run/analysis/MergeLoFreqVcfHandler.java

Lines changed: 92 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -108,103 +108,70 @@ private class SiteAndAlleles
108108
{
109109
private final String _contig;
110110
private final int _start;
111+
private final Allele _ref;
111112

112-
Map<Allele, List<Allele>> _encounteredAlleles = new HashMap<>();
113-
114-
Allele _ref = null;
113+
Map<String, Map<Allele, Allele>> _encounteredAlleles = new HashMap<>();
115114
List<Allele> _alternates;
116-
Map<Allele, Map<Allele, Allele>> _renamedAlleles;
117115

118-
public SiteAndAlleles(String contig, int start)
116+
public SiteAndAlleles(String contig, int start, Allele ref)
119117
{
120118
_contig = contig;
121119
_start = start;
120+
_ref = ref;
122121
}
123122

124-
public void addSite(VariantContext vc)
123+
public Allele getRenamedAllele(VariantContext vc, Allele toCheck)
125124
{
126-
List<Allele> alleles = _encounteredAlleles.getOrDefault(vc.getReference(), new ArrayList<>());
127-
vc.getAlternateAlleles().forEach(a -> {
128-
if (!alleles.contains(a))
129-
{
130-
alleles.add(a);
131-
}
132-
});
133-
_encounteredAlleles.put(vc.getReference(), alleles);
125+
Allele ret = _encounteredAlleles.get(getEncounteredKey(vc.getStart(), vc.getReference())).get(toCheck);
126+
127+
return ret == null ? toCheck : ret;
134128
}
135129

136-
public void doFinalize()
130+
private String getEncounteredKey(int start, Allele ref)
137131
{
138-
if (_ref != null)
139-
{
140-
throw new IllegalArgumentException("Has already been finalized!");
141-
}
142-
143-
if (_encounteredAlleles.keySet().size() == 1)
144-
{
145-
_ref = _encounteredAlleles.keySet().iterator().next();
146-
_alternates = Collections.unmodifiableList(_encounteredAlleles.get(_ref));
147-
_renamedAlleles = Collections.emptyMap();
148-
149-
return;
150-
}
151-
152-
Allele finalRef = null;
153-
for (Allele ref : _encounteredAlleles.keySet())
154-
{
155-
if (finalRef == null)
156-
{
157-
finalRef = ref;
158-
}
159-
else
160-
{
161-
finalRef = determineReferenceAllele(finalRef, ref);
162-
}
163-
}
164-
_ref = finalRef;
165-
166-
List<Allele> finalAlleles = new ArrayList<>();
167-
_renamedAlleles = new HashMap<>();
132+
return start + ":" + ref.getBaseString();
133+
}
168134

169-
for (Allele ref : _encounteredAlleles.keySet())
170-
{
171-
if (ref.equals(finalRef))
172-
{
173-
finalAlleles.addAll(_encounteredAlleles.get(finalRef));
174-
}
175-
else
135+
public void addSite(VariantContext vc)
136+
{
137+
Map<Allele, Allele> alleles = _encounteredAlleles.getOrDefault(getEncounteredKey(vc.getStart(), vc.getReference()), new HashMap<>());
138+
vc.getAlternateAlleles().forEach(a -> {
139+
Allele translated = extractAlleleForPosition(vc, a);
140+
if (translated != null)
176141
{
177-
Map<Allele, Allele> alleleMap = createAlleleMapping(_ref, ref, _encounteredAlleles.get(ref));
178-
for (Allele a : _encounteredAlleles.get(ref))
179-
{
180-
a = alleleMap.getOrDefault(a, a);
181-
if (!finalAlleles.contains(a))
182-
{
183-
finalAlleles.add(a);
184-
}
185-
}
142+
if (!alleles.containsKey(a))
143+
alleles.put(a, translated);
186144

187-
_renamedAlleles.put(ref, alleleMap);
145+
if (!_alternates.contains(translated))
146+
_alternates.add(translated);
188147
}
189-
}
190-
_alternates = Collections.unmodifiableList(finalAlleles);
148+
});
149+
_encounteredAlleles.put(getEncounteredKey(vc.getStart(), vc.getReference()), alleles);
191150
}
192151

193152
public boolean isMergedRef()
194153
{
195154
return _encounteredAlleles.size() > 1;
196155
}
197156

198-
public void checkValid(Logger log)
157+
protected Allele extractAlleleForPosition(VariantContext vc, Allele a)
199158
{
200-
if (_alternates.isEmpty())
159+
int offset = vc.getStart() - _start;
160+
161+
//deletion
162+
if (a.length() <= offset)
163+
{
164+
return Allele.SPAN_DEL;
165+
}
166+
else if (offset == 0 && a.length() == 1)
167+
{
168+
return a;
169+
}
170+
else
201171
{
202-
log.error("No alternate alleles found: " + _contig + "/" + _start + "/" + _ref);
203-
_encounteredAlleles.forEach((a, b) -> {
204-
log.error(a.getBaseString() + ": " + b.stream().map(Allele::getBaseString).collect(Collectors.joining(",")));
205-
});
172+
Allele ret = Allele.create(a.getBaseString().substring(offset, offset + 1));
206173

207-
throw new IllegalArgumentException("No alternate alleles found: " + _contig + "/" + _start + "/" + _ref);
174+
return _ref.equals(ret, true) ? null: ret;
208175
}
209176
}
210177
}
@@ -218,7 +185,7 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
218185
final double minAfThreshold = ctx.getParams().optDouble(MIN_AF, 0.0);
219186
final int minDepth = ctx.getParams().optInt(MIN_COVERAGE, 0);
220187

221-
ctx.getLogger().info("Building whitelist of sites and alleles");
188+
ctx.getLogger().info("Pass 1: Building whitelist of sites");
222189
Map<String, SiteAndAlleles> siteToAllele = new HashMap<>();
223190
List<Pair<String, Integer>> whitelistSites = new ArrayList<>();
224191

@@ -250,15 +217,18 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
250217
double af = vc.getAttributeAsDouble("AF", 0.0);
251218
if (af >= minAfThreshold)
252219
{
253-
String key = getCacheKey(vc.getContig(), vc.getStart());
254-
SiteAndAlleles site = siteToAllele.containsKey(key) ? siteToAllele.get(key) : new SiteAndAlleles(vc.getContig(), vc.getStart());
255-
site.addSite(vc);
256-
257-
if (!siteToAllele.containsKey(key))
220+
for (int i = 0; i < vc.getLengthOnReference();i++)
258221
{
259-
whitelistSites.add(Pair.of(vc.getContig(), vc.getStart()));
222+
int effectiveStart = vc.getStart() + i;
223+
String key = getCacheKey(vc.getContig(), effectiveStart);
224+
Allele ref = vc.getLengthOnReference() == 1 ? vc.getReference() : Allele.create(vc.getReference().getBaseString().substring(i, i + 1), true);
225+
SiteAndAlleles site = siteToAllele.containsKey(key) ? siteToAllele.get(key) : new SiteAndAlleles(vc.getContig(), effectiveStart, ref);
226+
if (!siteToAllele.containsKey(key))
227+
{
228+
whitelistSites.add(Pair.of(vc.getContig(), effectiveStart));
229+
}
230+
siteToAllele.put(key, site);
260231
}
261-
siteToAllele.put(key, site);
262232
}
263233
}
264234
}
@@ -267,10 +237,31 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
267237
ctx.getLogger().info("total sites: " + whitelistSites.size());
268238
Collections.sort(whitelistSites);
269239

270-
siteToAllele.forEach((x, y) -> {
271-
y.doFinalize();
272-
y.checkValid(ctx.getLogger());
273-
});
240+
ctx.getLogger().info("Pass 2: establish alleles per site");
241+
for (SequenceOutputFile so : inputFiles)
242+
{
243+
try (VCFFileReader reader = new VCFFileReader(so.getFile()))
244+
{
245+
for (Pair<String, Integer> site : whitelistSites)
246+
{
247+
String key = getCacheKey(site.getLeft(), site.getRight());
248+
249+
//NOTE: LoFreq should output one VCF line per allele
250+
//NOTE: deletions spanning this site can also be included, with a start position ahead of this.
251+
try (CloseableIterator<VariantContext> it = reader.query(site.getLeft(), site.getRight(), site.getRight()))
252+
{
253+
VariantContext vc = it.next();
254+
if (vc.getAttribute("AF") == null)
255+
{
256+
continue;
257+
}
258+
259+
//NOTE: the start position of this SiteAndAlleles might differ from the VC
260+
siteToAllele.get(key).addSite(vc);
261+
}
262+
}
263+
}
264+
}
274265

275266
ReferenceGenome genome = ctx.getSequenceSupport().getCachedGenome(genomeIds.iterator().next());
276267
SAMSequenceDictionary dict = SAMSequenceDictionaryExtractor.extractDictionary(genome.getSequenceDictionary().toPath());
@@ -342,20 +333,11 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c
342333
{
343334
VariantContext vc = it.next();
344335

345-
//This could occur with a deletion from the prior position
346-
if (vc.getStart() < siteDef._start)
347-
{
348-
ctx.getLogger().info("Variant start less than site def, probably indicates an upstream deletion: " + siteDef._start + " / " + vc.getStart() + " / " + so.getFile().getPath());
349-
continue;
350-
}
351-
else if (vc.getStart() > siteDef._start)
336+
if (vc.getStart() > siteDef._start)
352337
{
353338
throw new PipelineJobException("Unexpected variant start. site: " + siteDef._start + " / vc: " + vc.getStart() + " / " + so.getFile().getPath());
354339
}
355340

356-
refs.add(vc.getReference());
357-
Map<Allele, Allele> alleleRenameMap = siteDef._renamedAlleles.getOrDefault(vc.getReference(), Collections.emptyMap());
358-
359341
if (vc.getAttribute("GATK_DP") == null)
360342
{
361343
throw new PipelineJobException("Expected GATK_DP annotation on line " + key + " in file: " + so.getFile().getPath());
@@ -370,7 +352,12 @@ else if (vc.getStart() > siteDef._start)
370352
if (depth < minDepth)
371353
{
372354
vc.getAlternateAlleles().forEach(a -> {
373-
alleleToAf.put(a, NO_DATA_VAL);
355+
Allele translatedAllele = siteDef.getRenamedAllele(vc, a);
356+
if (translatedAllele != null)
357+
{
358+
double val = alleleToAf.getOrDefault(a, NO_DATA_VAL);
359+
alleleToAf.put(a, val);
360+
}
374361
});
375362
}
376363
else
@@ -385,9 +372,15 @@ else if (vc.getStart() > siteDef._start)
385372
vc.getAlternateAlleles().forEach(a -> {
386373
int aIdx = vc.getAlternateAlleles().indexOf(a);
387374

388-
//If the alleles were renamed due to combining references of differing length, account for this:
389-
a = alleleRenameMap.getOrDefault(a, a);
390-
alleleToAf.put(a, afs.get(aIdx));
375+
a = siteDef.getRenamedAllele(vc, a);
376+
double val = alleleToAf.getOrDefault(a, 0.0);
377+
if (val == NO_DATA_VAL)
378+
{
379+
val = 0;
380+
}
381+
382+
val = val + afs.get(aIdx);
383+
alleleToAf.put(a, val);
391384
});
392385
}
393386
}
@@ -413,10 +406,10 @@ else if (vc.getStart() > siteDef._start)
413406
refs.forEach(r -> {
414407
if (!siteDef._encounteredAlleles.containsKey(r))
415408
{
416-
throw new IllegalArgumentException("Reference not found: " + r.getBaseString() + ", at site: " + siteDef._start + ", allowable: " + refs.stream().map(Allele::getBaseString).collect(Collectors.joining(",")));
409+
throw new IllegalArgumentException("Reference not found: " + r.toString() + ", at site: " + siteDef._start + ", all refs: " + refs.stream().map(Allele::toString).collect(Collectors.joining(",")) + ", allowed: " + StringUtils.join(siteDef._encounteredAlleles.keySet(), ","));
417410
}
418411

419-
alleleSets.add(siteDef._encounteredAlleles.get(r).stream().map(Allele::getBaseString).collect(Collectors.joining(",")));
412+
alleleSets.add(StringUtils.join(siteDef._encounteredAlleles.get(r).keySet(), ","));
420413
});
421414
toWrite.add(alleleSets.stream().collect(Collectors.joining(";")));
422415
}

0 commit comments

Comments
 (0)