|
package org.labkey.sequenceanalysis.run.analysis;

+import au.com.bytecode.opencsv.CSVReader;
import au.com.bytecode.opencsv.CSVWriter;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;

import org.labkey.api.pipeline.PipelineJob;
import org.labkey.api.pipeline.PipelineJobException;
import org.labkey.api.pipeline.RecordedAction;
+import org.labkey.api.reader.Readers;
+import org.labkey.api.sequenceanalysis.SequenceAnalysisService;
import org.labkey.api.sequenceanalysis.SequenceOutputFile;
import org.labkey.api.sequenceanalysis.pipeline.AbstractParameterizedOutputHandler;
import org.labkey.api.sequenceanalysis.pipeline.ReferenceGenome;
import org.labkey.api.sequenceanalysis.pipeline.SequenceAnalysisJobSupport;
import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler;
import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor;
+import org.labkey.api.util.FileUtil;
import org.labkey.api.util.PageFlowUtil;
import org.labkey.api.util.StringUtilsLabKey;
import org.labkey.sequenceanalysis.SequenceAnalysisModule;

@@ -225,69 +229,112 @@ public void processFilesRemote(List<SequenceOutputFile> inputFiles, JobContext c

            Set<Integer> genomeIds = new HashSet<>();

-            for (SequenceOutputFile so : inputFiles)
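+            //Write a single gzipped TSV that combines the LoFreq and pindel indel calls from every input VCF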
+            File indelOutput = new File(ctx.getOutputDir(), basename + "indels.txt.gz");
+            try (CSVWriter writer = new CSVWriter(new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(indelOutput)), StringUtilsLabKey.DEFAULT_CHARSET)), '\t', CSVWriter.NO_QUOTE_CHARACTER))
            {
-                //This will error if the coverage file is not found. Perform check now to fail fast
-                try
-                {
-                    getDepthFile(so.getFile());
-                }
-                catch (PipelineJobException e)
-                {
-                    errors.add(e.getMessage());
-                }
+                writer.writeNext(new String[]{"ReadsetName", "OutputFileId", "ReadsetId", "Source", "Contig", "Start", "End", "Ref", "AltAllele", "GatkDepth", "LoFreqDepth", "AltCount", "AltAF"});

-                if (so.getLibrary_id() == null)
+                for (SequenceOutputFile so : inputFiles)
                {
-                    throw new PipelineJobException("VCF lacks library id: " + so.getRowid());
-                }
+                    //This will error if the coverage file is not found. Perform check now to fail fast
+                    try
+                    {
+                        getDepthFile(so.getFile());
+                    }
+                    catch (PipelineJobException e)
+                    {
+                        errors.add(e.getMessage());
+                    }

-                genomeIds.add(so.getLibrary_id());
-                if (genomeIds.size() > 1)
-                {
-                    throw new PipelineJobException("Samples use more than one genome. Genome IDs: " + StringUtils.join(genomeIds, ","));
-                }
+                    if (so.getLibrary_id() == null)
+                    {
+                        throw new PipelineJobException("VCF lacks library id: " + so.getRowid());
+                    }

-                try (VCFFileReader reader = new VCFFileReader(so.getFile()); CloseableIterator<VariantContext> it = reader.iterator())
-                {
-                    while (it.hasNext())
+                    genomeIds.add(so.getLibrary_id());
+                    if (genomeIds.size() > 1)
                    {
-                        VariantContext vc = it.next();
-                        if (vc.getAttribute("AF") == null)
-                        {
-                            continue;
-                        }
+                        throw new PipelineJobException("Samples use more than one genome. Genome IDs: " + StringUtils.join(genomeIds, ","));
+                    }

-                        //Also perform santity check of VCF early
-                        if (vc.getAttribute("GATK_DP") == null)
+                    try (VCFFileReader reader = new VCFFileReader(so.getFile()); CloseableIterator<VariantContext> it = reader.iterator())
+                    {
+                        while (it.hasNext())
                        {
-                            errors.add("Expected GATK_DP annotation on line " + getCacheKey(vc.getContig(), vc.getStart()) + " in file: " + so.getFile().getPath());
-                        }
+                            VariantContext vc = it.next();
+                            if (vc.getAttribute("AF") == null)
+                            {
+                                continue;
+                            }

-                        double af = vc.getAttributeAsDouble("AF", 0.0);
-                        if (af >= minAfThreshold)
-                        {
-                            if (vc.isIndel())
+                            //Also perform a sanity check of the VCF early
+                            if (vc.getAttribute("GATK_DP") == null)
                            {
-                                uniqueIndels.add(vc.getContig() + "<>" + vc.getStart() + "<>" + vc.getReference().getBaseString() + "<>" + vc.getAlternateAlleles().get(0).getBaseString());
+                                errors.add("Expected GATK_DP annotation on line " + getCacheKey(vc.getContig(), vc.getStart()) + " in file: " + so.getFile().getPath());
                            }

-                            for (int i = 0; i < vc.getLengthOnReference();i++)
+                            double af = vc.getAttributeAsDouble("AF", 0.0);
+                            if (af >= minAfThreshold)
                            {
-                                int effectiveStart = vc.getStart() + i;
-                                String key = getCacheKey(vc.getContig(), effectiveStart);
-                                Allele ref = vc.getLengthOnReference() == 1 ? vc.getReference() : Allele.create(vc.getReference().getBaseString().substring(i, i + 1), true);
-                                SiteAndAlleles site = siteToAllele.containsKey(key) ? siteToAllele.get(key) : new SiteAndAlleles(vc.getContig(), effectiveStart, ref);
-                                if (!siteToAllele.containsKey(key))
+                                if (vc.isIndel())
                                {
-                                    whitelistSites.add(Pair.of(vc.getContig(), effectiveStart));
+                                    uniqueIndels.add(vc.getContig() + "<>" + vc.getStart() + "<>" + vc.getReference().getBaseString() + "<>" + vc.getAlternateAlleles().get(0).getBaseString());
+                                    List<Integer> depths = vc.getAttributeAsIntList("DP4", 0);
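+                                    //DP4 lists ref-forward, ref-reverse, alt-forward and alt-reverse read counts, so the last two entries give the alt-supporting depth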
+                                    int alleleDepth = depths.get(2) + depths.get(3);
+
+                                    writer.writeNext(new String[]{ctx.getSequenceSupport().getCachedReadset(so.getReadset()).getName(), String.valueOf(so.getRowid()), String.valueOf(so.getReadset()), "LoFreq", vc.getContig(), String.valueOf(vc.getStart()), String.valueOf(vc.getEnd()), vc.getReference().getBaseString(), vc.getAlternateAlleles().get(0).getBaseString(), vc.getAttributeAsString("GATK_DP", "ND"), vc.getAttributeAsString("DP", "ND"), String.valueOf(alleleDepth), vc.getAttributeAsString("AF", "ND")});
+                                }
+
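+                                //Register every reference position spanned by this variant: new positions are added to whitelistSites, and siteToAllele tracks a SiteAndAlleles per position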
+                                for (int i = 0; i < vc.getLengthOnReference(); i++)
+                                {
+                                    int effectiveStart = vc.getStart() + i;
+                                    String key = getCacheKey(vc.getContig(), effectiveStart);
+                                    Allele ref = vc.getLengthOnReference() == 1 ? vc.getReference() : Allele.create(vc.getReference().getBaseString().substring(i, i + 1), true);
+                                    SiteAndAlleles site = siteToAllele.containsKey(key) ? siteToAllele.get(key) : new SiteAndAlleles(vc.getContig(), effectiveStart, ref);
+                                    if (!siteToAllele.containsKey(key))
+                                    {
+                                        whitelistSites.add(Pair.of(vc.getContig(), effectiveStart));
+                                    }
+                                    siteToAllele.put(key, site);
+                                }
+                            }
+                        }
+                    }
+
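+                    //Look for the pindel summary written alongside this VCF; its name is the VCF basename with any trailing ".lofreq" stripped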
+                    String pindelBasename = SequenceAnalysisService.get().getUnzippedBaseName(so.getFile().getName());
+                    if (pindelBasename.endsWith("lofreq"))
+                    {
+                        pindelBasename = FileUtil.getBaseName(pindelBasename);
+                    }
+
+                    File pindelFile = new File(so.getFile().getParentFile(), pindelBasename + ".pindel.txt");
+                    if (pindelFile.exists())
+                    {
+                        try (CSVReader reader = new CSVReader(Readers.getReader(pindelFile), '\t'))
+                        {
+                            String[] line;
+                            while ((line = reader.readNext()) != null)
+                            {
+                                //The pindel summary columns are: Type, Contig, Start, End, Depth, ReadSupport, Fraction
+                                if (line[0].equals("Type"))
+                                {
+                                    continue;
                                }
-                                    siteToAllele.put(key, site);
+
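+                                //Pindel rows reuse the shared header: the pindel event type goes in the AltAllele column, its Depth in GatkDepth, ReadSupport in AltCount and Fraction in AltAF; Ref and LoFreqDepth are left blank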
+                                writer.writeNext(new String[]{ctx.getSequenceSupport().getCachedReadset(so.getReadset()).getName(), String.valueOf(so.getRowid()), String.valueOf(so.getReadset()), "Pindel", line[1], line[2], line[3], "", line[0], line[4], "", line[5], line[6]});
                            }
                        }
                    }
+                    else
+                    {
+                        ctx.getLogger().warn("Unable to find pindel file, expected: " + pindelFile.getPath());
+                    }
                }
            }
+            catch (IOException e)
+            {
+                throw new PipelineJobException(e);
+            }

            if (!errors.isEmpty())
            {
|