|
1 | 1 | package org.labkey.singlecell.run; |
2 | 2 |
|
| 3 | +import au.com.bytecode.opencsv.CSVReader; |
| 4 | +import au.com.bytecode.opencsv.CSVWriter; |
3 | 5 | import org.apache.logging.log4j.Logger; |
4 | 6 | import org.jetbrains.annotations.Nullable; |
5 | 7 | import org.json.JSONObject; |
6 | 8 | import org.labkey.api.pipeline.PipelineJobException; |
| 9 | +import org.labkey.api.reader.Readers; |
7 | 10 | import org.labkey.api.sequenceanalysis.model.Readset; |
8 | 11 | import org.labkey.api.sequenceanalysis.pipeline.AbstractAlignmentStepProvider; |
9 | 12 | import org.labkey.api.sequenceanalysis.pipeline.AlignmentOutputImpl; |
|
15 | 18 | import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor; |
16 | 19 | import org.labkey.api.sequenceanalysis.run.AbstractCommandWrapper; |
17 | 20 | import org.labkey.api.sequenceanalysis.run.SimpleScriptWrapper; |
| 21 | +import org.labkey.api.util.FileUtil; |
18 | 22 | import org.labkey.api.util.PageFlowUtil; |
| 23 | +import org.labkey.api.writer.PrintWriters; |
19 | 24 |
|
20 | 25 | import java.io.File; |
| 26 | +import java.io.IOException; |
21 | 27 | import java.util.ArrayList; |
22 | 28 | import java.util.Arrays; |
23 | 29 | import java.util.LinkedHashSet; |
@@ -98,6 +104,40 @@ public VelocytoWrapper(Logger log) |
98 | 104 |
|
99 | 105 | public File runVelocytoFor10x(File localBam, File gtf, File outputFolder, @Nullable File mask, Readset rs) throws PipelineJobException |
100 | 106 | { |
| 107 | + getLogger().debug("Inspecting GTF for lines without gene_id or transcript_id"); |
| 108 | + int linesDropped = 0; |
| 109 | + File gtfEdit = new File(outputFolder, FileUtil.getBaseName(gtf) + ".geneId.gtf"); |
| 110 | + try (CSVReader reader = new CSVReader(Readers.getReader(gtf), '\t', CSVWriter.NO_QUOTE_CHARACTER); CSVWriter writer = new CSVWriter(PrintWriters.getPrintWriter(gtfEdit), '\t', CSVWriter.NO_QUOTE_CHARACTER, CSVWriter.NO_ESCAPE_CHARACTER)) |
| 111 | + { |
| 112 | + String[] line; |
| 113 | + while ((line = reader.readNext()) != null) |
| 114 | + { |
| 115 | + //Drop lines lacking gene_id or transcript_id, or with an empty value for gene_id/transcript_id: |
| 116 | + if (!line[0].startsWith("#") && (!line[8].contains("gene_id") || !line[8].contains("transcript_id") || line[8].contains("gene_id \"\"") || line[8].contains("transcript_id \"\""))) |
| 117 | + { |
| 118 | + linesDropped++; |
| 119 | + continue; |
| 120 | + } |
| 121 | + |
| 122 | + writer.writeNext(line); |
| 123 | + } |
| 124 | + } |
| 125 | + catch (IOException e) |
| 126 | + { |
| 127 | + throw new PipelineJobException(e); |
| 128 | + } |
| 129 | + |
| 130 | + if (linesDropped == 0) |
| 131 | + { |
| 132 | + getLogger().debug("No GTF lines were invalid, using original"); |
| 133 | + gtfEdit.delete(); |
| 134 | + } |
| 135 | + else |
| 136 | + { |
| 137 | + getLogger().info("dropped " + linesDropped + " lines lacking gene_id, transcript_id, or with an empty value for gene_id/transcript_id"); |
| 138 | + gtf = gtfEdit; |
| 139 | + } |
| 140 | + |
101 | 141 | // https://velocyto.org/velocyto.py/tutorial/cli.html#run10x-run-on-10x-chromium-samples |
102 | 142 | // velocyto run10x -m repeat_msk.gtf mypath/sample01 somepath/refdata-cellranger-mm10-1.2.0/genes/genes.gtf |
103 | 143 | // velocyto run -b filtered_barcodes.tsv -o output_path -m repeat_msk_srt.gtf possorted_genome_bam.bam mm10_annotation.gtf |
@@ -141,6 +181,11 @@ public File runVelocytoFor10x(File localBam, File gtf, File outputFolder, @Nulla |
141 | 181 |
|
142 | 182 | wrapper.execute(args); |
143 | 183 |
|
| 184 | + if (gtfEdit.exists()) |
| 185 | + { |
| 186 | + gtfEdit.delete(); |
| 187 | + } |
| 188 | + |
144 | 189 | File loom = new File(outputFolder, sampleName + ".loom"); |
145 | 190 | if (!loom.exists()) |
146 | 191 | { |
|
0 commit comments