Skip to content

Commit 49efdb0

Browse files
committed
Improve CellRangerVDJ
1 parent 7020bf9 commit 49efdb0

File tree

1 file changed

+204
-50
lines changed

1 file changed

+204
-50
lines changed

singlecell/src/org/labkey/singlecell/run/CellRangerVDJWrapper.java

Lines changed: 204 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
import org.labkey.api.util.PageFlowUtil;
4545
import org.labkey.api.writer.PrintWriters;
4646

47+
import java.io.BufferedReader;
4748
import java.io.File;
4849
import java.io.IOException;
4950
import java.io.PrintWriter;
@@ -382,88 +383,123 @@ public AlignmentStep.AlignmentOutput performAlignment(Readset rs, List<File> inp
382383
getWrapper().execute(args);
383384

384385
File outdir = new File(outputDirectory, id);
385-
outdir = new File(outdir, "multi");
386386
outdir = new File(outdir, "outs");
387387

388-
File abDir = new File(outdir, "vdj_t");
389-
File gdDir = new File(outdir, "vdj_t_gd");
390388

391-
File csvAB = processOutputsForType(rs, referenceGenome, abDir, output, "alpha/beta");
392-
File csvGD = processOutputsForType(rs, referenceGenome, gdDir, output, "gamma/delta");
389+
File csvAB = processOutputsForType(id, rs, referenceGenome, outdir, output, "vdj_t");
390+
File csvGD = processOutputsForType(id, rs, referenceGenome, outdir, output, "vdj_t_gd");
393391

394-
deleteSymlinks(localFqDir);
395-
396-
throw new PipelineJobException("This is under development!");
397-
//return output;
398-
}
392+
File combinedCSV = processAndMergeCSVs(csvAB, csvGD, getPipelineCtx().getLogger());
399393

400-
private File processOutputsForType(Readset rs, ReferenceGenome referenceGenome, File outdir, AlignmentOutputImpl output, String chainType) throws PipelineJobException
401-
{
402-
File bam = new File(outdir, "all_contig.bam");
403-
if (!bam.exists())
394+
//NOTE: this folder has many unnecessary files and symlinks that get corrupted when we rename the main outputs
395+
File directory = new File(outdir.getParentFile(), "SC_MULTI_CS");
396+
if (directory.exists())
404397
{
405-
throw new PipelineJobException("Unable to find file: " + bam.getPath());
398+
//NOTE: this will have lots of symlinks, including corrupted ones, which java handles badly
399+
new SimpleScriptWrapper(getPipelineCtx().getLogger()).execute(Arrays.asList("rm", "-Rf", directory.getPath()));
400+
}
401+
else
402+
{
403+
getPipelineCtx().getLogger().warn("Unable to find folder: " + directory.getPath());
406404
}
407-
output.setBAM(bam);
408405

409-
//now do cleanup/rename:
410406
try
411407
{
412-
String prefix = FileUtil.makeLegalName(rs.getName() + "_");
413-
File outputHtml = new File(outdir, "web_summary.html");
408+
File outputHtml = new File(outdir, "per_sample_outs/" + id + "/web_summary.html");
414409
if (!outputHtml.exists())
415410
{
416411
throw new PipelineJobException("Unable to find file: " + outputHtml.getPath());
417412
}
418413

419-
File outputHtmlRename = new File(outdir, prefix + outputHtml.getName());
414+
File outputHtmlRename = new File(outputHtml.getParentFile(), FileUtil.makeLegalName(rs.getName() + "_") + outputHtml.getName());
420415
if (outputHtmlRename.exists())
421416
{
422417
outputHtmlRename.delete();
423418
}
424419
FileUtils.moveFile(outputHtml, outputHtmlRename);
425420

426-
output.addSequenceOutput(outputHtmlRename, rs.getName() + " 10x VDJ Summary: " + chainType, "10x Run Summary", rs.getRowId(), null, referenceGenome.getGenomeId(), null);
421+
output.addSequenceOutput(outputHtmlRename, rs.getName() + " 10x VDJ Summary", "10x Run Summary", rs.getRowId(), null, referenceGenome.getGenomeId(), null);
422+
}
423+
catch (IOException e)
424+
{
425+
throw new PipelineJobException(e);
426+
}
427427

428-
File outputVloupe = new File(outdir, "vloupe.vloupe");
429-
File csv = new File(outdir, "all_contig_annotations.csv");
430-
if (!outputVloupe.exists())
428+
deleteSymlinks(localFqDir);
429+
430+
return output;
431+
}
432+
433+
private File processOutputsForType(String sampleId, Readset rs, ReferenceGenome referenceGenome, File outdir, AlignmentOutputImpl output, String subdirName) throws PipelineJobException
434+
{
435+
boolean isPrimaryDir = "vdj_t".equals(subdirName);
436+
String chainType = "vdj_t".equals(subdirName) ? "Alpha/Beta" : "Gamma/Delta";
437+
438+
File multiDir = new File(outdir, "multi/" + subdirName);
439+
if (!multiDir.exists())
440+
{
441+
throw new PipelineJobException("Missing folder: " + multiDir.getPath());
442+
}
443+
444+
File sampleDir = new File(outdir, "per_sample_outs/" + sampleId + "/" + subdirName);
445+
if (!sampleDir.exists())
446+
{
447+
throw new PipelineJobException("Missing folder: " + sampleDir.getPath());
448+
}
449+
450+
// For simplicity, consolidate all files:
451+
for (File f : multiDir.listFiles())
452+
{
453+
try
431454
{
432-
//NOTE: if there were no A/B hits, the vLoupe isnt created, but all other outputs exist
433-
if (!csv.exists())
434-
{
435-
throw new PipelineJobException("Unable to find file: " + outputVloupe.getPath());
436-
}
455+
FileUtils.moveFile(f, new File(sampleDir, f.getName()));
437456
}
438-
else
457+
catch (IOException e)
439458
{
440-
File outputVloupeRename = new File(outdir, prefix + outputVloupe.getName());
441-
if (outputVloupeRename.exists())
442-
{
443-
outputVloupeRename.delete();
444-
}
445-
FileUtils.moveFile(outputVloupe, outputVloupeRename);
446-
output.addSequenceOutput(outputVloupeRename, rs.getName() + " 10x VLoupe", "10x VLoupe", rs.getRowId(), null, referenceGenome.getGenomeId(), null);
459+
throw new PipelineJobException(e);
447460
}
461+
}
448462

449-
//NOTE: this folder has many unnecessary files and symlinks that get corrupted when we rename the main outputs
450-
File directory = new File(outdir.getParentFile(), "SC_VDJ_ASSEMBLER_CS");
451-
if (directory.exists())
452-
{
453-
//NOTE: this will have lots of symlinks, including corrupted ones, which java handles badly
454-
new SimpleScriptWrapper(getPipelineCtx().getLogger()).execute(Arrays.asList("rm", "-Rf", directory.getPath()));
455-
}
456-
else
463+
File bam = new File(sampleDir, "all_contig.bam");
464+
if (!bam.exists())
465+
{
466+
throw new PipelineJobException("Unable to find file: " + bam.getPath());
467+
}
468+
469+
if (isPrimaryDir)
470+
{
471+
output.setBAM(bam);
472+
}
473+
else
474+
{
475+
getPipelineCtx().getLogger().debug("Deleting BAM: " + bam.getPath());
476+
bam.delete();
477+
new File(bam.getPath() + ".bai").delete();
478+
}
479+
480+
File allContigAnnotations = new File(sampleDir, "all_contig_annotations.csv");
481+
if (!allContigAnnotations.exists())
482+
{
483+
throw new PipelineJobException("Missing file: " + allContigAnnotations.getPath());
484+
}
485+
486+
File outputVloupe = new File(sampleDir, "vloupe.vloupe");
487+
File csv = new File(sampleDir, "all_contig_annotations.csv");
488+
if (!outputVloupe.exists())
489+
{
490+
//NOTE: if there were no A/B hits, the vLoupe isnt created, but all other outputs exist
491+
if (!csv.exists())
457492
{
458-
getPipelineCtx().getLogger().warn("Unable to find folder: " + directory.getPath());
493+
throw new PipelineJobException("Unable to find file: " + outputVloupe.getPath());
459494
}
460-
461-
return csv;
462495
}
463-
catch (IOException e)
496+
// NOTE: only tag the vloupe file for a/b:
497+
else if (isPrimaryDir)
464498
{
465-
throw new PipelineJobException(e);
499+
output.addSequenceOutput(outputVloupe, rs.getName() + " 10x VLoupe", "10x VLoupe", rs.getRowId(), null, referenceGenome.getGenomeId(), null);
466500
}
501+
502+
return csv;
467503
}
468504

469505
@Override
@@ -603,6 +639,7 @@ public void addMetrics(AnalysisModel model) throws PipelineJobException
603639
getPipelineCtx().getLogger().debug("adding 10x metrics");
604640

605641
// TODO: improve
642+
//File outputHtml = new File(outdir, "per_sample_outs/" + id + "/web_summary.html");
606643
File metrics = new File(model.getAlignmentFileObject().getParentFile(), "metrics_summary.csv");
607644
if (metrics.exists())
608645
{
@@ -741,4 +778,121 @@ protected File getExe()
741778
{
742779
return SequencePipelineService.get().getExeForPackage("CELLRANGERPATH", "cellranger");
743780
}
781+
782+
private static File processAndMergeCSVs(File abCSV, File gdCSV, Logger log) throws PipelineJobException
783+
{
784+
File output = new File(abCSV.getParentFile(), "all_contig_annotations_combined.csv");
785+
786+
try (PrintWriter writer = PrintWriters.getPrintWriter(output))
787+
{
788+
processCSV(writer, true, abCSV, log, Arrays.asList("TRA", "TRB"));
789+
processCSV(writer, false, gdCSV, log, Arrays.asList("TRD", "TRG"));
790+
}
791+
catch (IOException e)
792+
{
793+
throw new PipelineJobException(e);
794+
}
795+
796+
return output;
797+
}
798+
799+
private static void processCSV(PrintWriter writer, boolean printHeader, File inputCsv, Logger log, List<String> acceptableChains) throws IOException
800+
{
801+
log.info("Processing CSV: " + inputCsv.getPath());
802+
try (BufferedReader reader = Readers.getReader(inputCsv))
803+
{
804+
String line;
805+
int chimericCallsRecovered = 0;
806+
807+
int lineIdx = 0;
808+
while ((line = reader.readLine()) != null)
809+
{
810+
lineIdx++;
811+
if (lineIdx == 1)
812+
{
813+
if (printHeader)
814+
{
815+
writer.println(line);
816+
}
817+
818+
continue;
819+
}
820+
821+
//Infer correct chain from the V, J and C genes
822+
String[] tokens = line.split(",", -1); // -1 used to preserve trailing empty strings
823+
List<String> chains = new ArrayList<>();
824+
String vGeneChain = null;
825+
String jGeneChain = null;
826+
String cGeneChain = null;
827+
for (int idx : new Integer[]{6,8,9})
828+
{
829+
String val = StringUtils.trimToNull(tokens[idx]);
830+
if (val != null)
831+
{
832+
val = val.substring(0, 3);
833+
834+
chains.add(val);
835+
if (idx == 6)
836+
{
837+
vGeneChain = val;
838+
}
839+
if (idx == 8)
840+
{
841+
jGeneChain = val;
842+
}
843+
else if (idx == 9)
844+
{
845+
cGeneChain = val;
846+
}
847+
}
848+
}
849+
850+
Set<String> uniqueChains = new HashSet<>(chains);
851+
String originalChain = StringUtils.trimToNull(tokens[5]);
852+
853+
// Recover TRDV/TRAJ/TRAC:
854+
if (uniqueChains.size() > 1)
855+
{
856+
if (cGeneChain != null)
857+
{
858+
uniqueChains.clear();
859+
uniqueChains.add(cGeneChain);
860+
chimericCallsRecovered++;
861+
}
862+
else if (uniqueChains.size() == 2)
863+
{
864+
if ("TRD".equals(vGeneChain) && "TRA".equals(jGeneChain))
865+
{
866+
uniqueChains.clear();
867+
chimericCallsRecovered++;
868+
uniqueChains.add(vGeneChain);
869+
}
870+
if ("TRA".equals(vGeneChain) && "TRD".equals(jGeneChain))
871+
{
872+
uniqueChains.clear();
873+
chimericCallsRecovered++;
874+
uniqueChains.add(vGeneChain);
875+
}
876+
}
877+
}
878+
879+
if (uniqueChains.size() == 1)
880+
{
881+
String chain = uniqueChains.iterator().next();
882+
tokens[5] = chain;
883+
}
884+
else
885+
{
886+
log.error("Multiple chains detected [" + StringUtils.join(chains, ",")+ "], leaving original call alone: " + originalChain + ". " + tokens[6] + "/" + tokens[8] + "/" + tokens[9]);
887+
}
888+
889+
if (acceptableChains.contains(tokens[5]))
890+
{
891+
writer.println(StringUtils.join(tokens, ","));
892+
}
893+
}
894+
895+
log.info("\tChimeric calls recovered: " + chimericCallsRecovered);
896+
}
897+
}
744898
}

0 commit comments

Comments
 (0)