Skip to content

Commit dccdd4d

Browse files
committed
Velocyto will error if the GTF has empty geneId, so drop these lines before running
1 parent d5dac32 commit dccdd4d

File tree

1 file changed

+45
-0
lines changed

1 file changed

+45
-0
lines changed

singlecell/src/org/labkey/singlecell/run/VelocytoAlignmentStep.java

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
package org.labkey.singlecell.run;
22

3+
import au.com.bytecode.opencsv.CSVReader;
4+
import au.com.bytecode.opencsv.CSVWriter;
35
import org.apache.logging.log4j.Logger;
46
import org.jetbrains.annotations.Nullable;
57
import org.json.JSONObject;
68
import org.labkey.api.pipeline.PipelineJobException;
9+
import org.labkey.api.reader.Readers;
710
import org.labkey.api.sequenceanalysis.model.Readset;
811
import org.labkey.api.sequenceanalysis.pipeline.AbstractAlignmentStepProvider;
912
import org.labkey.api.sequenceanalysis.pipeline.AlignmentOutputImpl;
@@ -15,9 +18,12 @@
1518
import org.labkey.api.sequenceanalysis.pipeline.ToolParameterDescriptor;
1619
import org.labkey.api.sequenceanalysis.run.AbstractCommandWrapper;
1720
import org.labkey.api.sequenceanalysis.run.SimpleScriptWrapper;
21+
import org.labkey.api.util.FileUtil;
1822
import org.labkey.api.util.PageFlowUtil;
23+
import org.labkey.api.writer.PrintWriters;
1924

2025
import java.io.File;
26+
import java.io.IOException;
2127
import java.util.ArrayList;
2228
import java.util.Arrays;
2329
import java.util.LinkedHashSet;
@@ -98,6 +104,40 @@ public VelocytoWrapper(Logger log)
98104

99105
public File runVelocytoFor10x(File localBam, File gtf, File outputFolder, @Nullable File mask, Readset rs) throws PipelineJobException
100106
{
107+
getLogger().debug("Inspecting GTF for lines without gene_id or transcript_id");
108+
int linesDropped = 0;
109+
File gtfEdit = new File(outputFolder, FileUtil.getBaseName(gtf) + ".geneId.gtf");
110+
try (CSVReader reader = new CSVReader(Readers.getReader(gtf), '\t', CSVWriter.NO_QUOTE_CHARACTER); CSVWriter writer = new CSVWriter(PrintWriters.getPrintWriter(gtfEdit), '\t', CSVWriter.NO_QUOTE_CHARACTER, CSVWriter.NO_ESCAPE_CHARACTER))
111+
{
112+
String[] line;
113+
while ((line = reader.readNext()) != null)
114+
{
115+
//Drop lines lacking gene_id/transcript, or with empty gene_id:
116+
if (!line[0].startsWith("#") && (!line[8].contains("gene_id") || !line[8].contains("transcript_id") || line[8].contains("gene_id \"\"") || line[8].contains("transcript_id \"\"")))
117+
{
118+
linesDropped++;
119+
continue;
120+
}
121+
122+
writer.writeNext(line);
123+
}
124+
}
125+
catch (IOException e)
126+
{
127+
throw new PipelineJobException(e);
128+
}
129+
130+
if (linesDropped == 0)
131+
{
132+
getLogger().debug("No GTF lines were invalid, using original");
133+
gtfEdit.delete();
134+
}
135+
else
136+
{
137+
getLogger().info("dropped " + linesDropped + " lines lacking gene_id, transcript_id, or with an empty value for gene_id/transcript_id");
138+
gtf = gtfEdit;
139+
}
140+
101141
// https://velocyto.org/velocyto.py/tutorial/cli.html#run10x-run-on-10x-chromium-samples
102142
// velocyto run10x -m repeat_msk.gtf mypath/sample01 somepath/refdata-cellranger-mm10-1.2.0/genes/genes.gtf
103143
// velocyto run -b filtered_barcodes.tsv -o output_path -m repeat_msk_srt.gtf possorted_genome_bam.bam mm10_annotation.gtf
@@ -141,6 +181,11 @@ public File runVelocytoFor10x(File localBam, File gtf, File outputFolder, @Nulla
141181

142182
wrapper.execute(args);
143183

184+
if (gtfEdit.exists())
185+
{
186+
gtfEdit.delete();
187+
}
188+
144189
File loom = new File(outputFolder, sampleName + ".loom");
145190
if (!loom.exists())
146191
{

0 commit comments

Comments
 (0)