3 | 3 | import au.com.bytecode.opencsv.CSVReader;
4 | 4 | import au.com.bytecode.opencsv.CSVWriter;
5 | 5 | import org.apache.commons.io.FileUtils;
| 6 | +import org.apache.commons.lang3.StringUtils;
| 7 | +import org.labkey.api.data.Container;
| 8 | +import org.labkey.api.data.SimpleFilter;
| 9 | +import org.labkey.api.data.Sort;
| 10 | +import org.labkey.api.data.TableSelector;
| 11 | +import org.labkey.api.pipeline.PipelineJob;
6 | 12 | import org.labkey.api.pipeline.PipelineJobException;
| 13 | +import org.labkey.api.query.FieldKey;
| 14 | +import org.labkey.api.query.QueryService;
7 | 15 | import org.labkey.api.reader.Readers;
8 | 16 | import org.labkey.api.sequenceanalysis.SequenceOutputFile;
9 | 17 | import org.labkey.api.sequenceanalysis.pipeline.AbstractPipelineStepProvider;
10 | 18 | import org.labkey.api.sequenceanalysis.pipeline.PipelineContext;
11 | 19 | import org.labkey.api.sequenceanalysis.pipeline.SequenceOutputHandler;
12 | 20 | import org.labkey.api.singlecell.pipeline.SingleCellStep;
| 21 | +import org.labkey.api.util.PageFlowUtil;
13 | 22 | import org.labkey.api.writer.PrintWriters;
| 23 | +import org.labkey.singlecell.SingleCellSchema;
14 | 24 |
15 | 25 | import java.io.File; |
16 | 26 | import java.io.IOException; |
17 | 27 | import java.util.ArrayList; |
18 | 28 | import java.util.Arrays; |
| 29 | +import java.util.HashMap; |
| 30 | +import java.util.HashSet; |
19 | 31 | import java.util.List; |
| 32 | +import java.util.Map; |
| 33 | +import java.util.Set; |
20 | 34 |
21 | 35 | import static org.labkey.singlecell.analysis.ProcessSingleCellHandler.LOUPE_TYPE; |
22 | 36 |
@@ -48,20 +62,72 @@ public void init(SequenceOutputHandler.JobContext ctx, List<SequenceOutputFile> |
48 | 62 | { |
49 | 63 | try (CSVWriter writer = new CSVWriter(PrintWriters.getPrintWriter(getMolInfoTable(ctx)))) |
50 | 64 | { |
| 65 | + Map<Integer, SequenceOutputFile> loupeOutputs = new HashMap<>(); |
51 | 66 | for (SequenceOutputFile so : inputFiles) |
52 | 67 | { |
53 | 68 | if (!LOUPE_TYPE.isType(so.getFile())) |
54 | 69 | { |
55 | | - throw new PipelineJobException("All input files must be loupe files to use sequence saturation"); |
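| | + // Non-loupe inputs are resolved back to their source loupe outputs via the accompanying .cellBarcodes.csv metadata file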
| 70 | + File meta = new File(so.getFile().getPath().replaceAll("\\.rds$", ".cellBarcodes.csv"));
| 71 | + if (!meta.exists()) |
| 72 | + { |
| 73 | + throw new PipelineJobException("Cannot find expected metadata file: " + meta.getPath()); |
| 74 | + } |
| 75 | + |
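| | + // Each barcode line is "<loupe output rowId>_<cell barcode>"; collect the distinct rowIds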
| 76 | + Set<Integer> uniqueIds = new HashSet<>(); |
| 77 | + try (CSVReader reader = new CSVReader(Readers.getReader(meta), '_')) |
| 78 | + { |
| 79 | + String[] line; |
| 80 | + while ((line = reader.readNext()) != null) |
| 81 | + { |
| 82 | + if (line.length != 2) |
| 83 | + { |
| 84 | + throw new PipelineJobException("Unexpected barcode line: " + StringUtils.join(line, "_")); |
| 85 | + } |
| 86 | + |
| 87 | + try |
| 88 | + { |
| 89 | + uniqueIds.add(Integer.parseInt(line[0])); |
| 90 | + } |
| 91 | + catch (NumberFormatException e) |
| 92 | + { |
| 93 | + throw new PipelineJobException("Non-numeric barcode prefix: " + StringUtils.join(line, "_")); |
| 94 | + } |
| 95 | + } |
| 96 | + } |
| 97 | + |
| 98 | + for (Integer rowId : uniqueIds) |
| 99 | + { |
| 100 | + SequenceOutputFile loupeObj = SequenceOutputFile.getForId(rowId); |
| 101 | + if (loupeObj == null) |
| 102 | + { |
| 103 | + throw new PipelineJobException("Unable to find loupe output file with ID: " + rowId); |
| 104 | + } |
| 105 | + loupeOutputs.put(rowId, loupeObj); |
| 106 | + } |
56 | 107 | } |
| 108 | + else |
| 109 | + { |
| 110 | + loupeOutputs.put(so.getRowid(), so); |
| 111 | + } |
| 112 | + } |
57 | 113 |
58 | | - File molInfo = new File(so.getFile().getParentFile(), "molecule_info.h5"); |
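| | + // Write one row per loupe output: rowId, path to its molecule_info.h5, and the assay type (RNA)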
| 114 | + for (Integer rowId : loupeOutputs.keySet()) |
| 115 | + { |
| 116 | + SequenceOutputFile loupeFile = loupeOutputs.get(rowId); |
| 117 | + File molInfo = new File(loupeFile.getFile().getParentFile(), "molecule_info.h5"); |
59 | 118 | if (!molInfo.exists()) |
60 | 119 | { |
61 | 120 | throw new PipelineJobException("Cannot find file: " + molInfo.getPath()); |
62 | 121 | } |
63 | 122 |
64 | | - writer.writeNext(new String[]{String.valueOf(so.getRowid()), molInfo.getPath()}); |
| 123 | + if (loupeFile.getReadset() == null) |
| 124 | + { |
| 125 | + throw new PipelineJobException("Loupe file lacks a readset: " + rowId); |
| 126 | + } |
| 127 | + |
| 128 | + writer.writeNext(new String[]{String.valueOf(loupeFile.getRowid()), molInfo.getPath(), "RNA"}); |
| 129 | + |
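| | + // Also emit rows for any cell hashing (HTO) or CITE-seq (ADT) libraries linked to this GEX readset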
| 130 | + findAdditionalData(loupeFile, writer, ctx.getJob()); |
65 | 131 | } |
66 | 132 | } |
67 | 133 | catch (IOException e) |
@@ -93,14 +159,15 @@ protected List<Chunk> getChunks(SequenceOutputHandler.JobContext ctx) throws Pip |
93 | 159 | while ((line = reader.readNext()) != null) |
94 | 160 | { |
95 | 161 | File source = new File(line[1]); |
96 | | - File dest = new File(ctx.getWorkingDirectory(), line[0] + ".molInfo.h5"); |
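| | + // The third column carries the assay name, keeping copied files and generated entries unique per assay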
| 162 | + String assay = line[2]; |
| 163 | + File dest = new File(ctx.getWorkingDirectory(), line[0] + "." + assay + ".molInfo.h5"); |
97 | 164 | if (dest.exists()) |
98 | 165 | { |
99 | 166 | dest.delete(); |
100 | 167 | } |
101 | 168 | FileUtils.copyFile(source, dest); |
102 | 169 |
103 | | - lines.add("\t'" + line[0] + "' = '" + dest.getName() + "',"); |
| 170 | + lines.add("\t'" + line[0] + "-" + assay + "' = '" + dest.getName() + "',"); |
104 | 171 | ctx.getFileManager().addIntermediateFile(dest); |
105 | 172 | } |
106 | 173 | } |
@@ -128,4 +195,65 @@ public String getFileSuffix() |
128 | 195 | { |
129 | 196 | return "saturation"; |
130 | 197 | } |
| 198 | + |
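| | + // Queries the cDNA table for this loupe file's readset and writes extra rows for any linked hashing or CITE-seq readsets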
| 199 | + private void findAdditionalData(SequenceOutputFile loupeFile, CSVWriter writer, PipelineJob job) throws IOException, PipelineJobException |
| 200 | + { |
| 201 | + Set<Integer> hashingReadsets = new HashSet<>(); |
| 202 | + Set<Integer> citeReadsets = new HashSet<>(); |
| 203 | + Container targetContainer = job.getContainer().isWorkbook() ? job.getContainer().getParent() : job.getContainer(); |
| 204 | + TableSelector ts = new TableSelector(QueryService.get().getUserSchema(job.getUser(), targetContainer, SingleCellSchema.NAME).getTable(SingleCellSchema.TABLE_CDNAS), PageFlowUtil.set("hashingReadsetId", "citeseqReadsetId"), new SimpleFilter(FieldKey.fromString("readsetId"), loupeFile.getReadset()), null); |
| 205 | + ts.forEachResults(rs -> { |
| 206 | + if (rs.getObject(FieldKey.fromString("hashingReadsetId")) != null) |
| 207 | + { |
| 208 | + hashingReadsets.add(rs.getInt(FieldKey.fromString("hashingReadsetId"))); |
| 209 | + } |
| 210 | + |
| 211 | + if (rs.getObject(FieldKey.fromString("citeseqReadsetId")) != null) |
| 212 | + { |
| 213 | + citeReadsets.add(rs.getInt(FieldKey.fromString("citeseqReadsetId"))); |
| 214 | + } |
| 215 | + }); |
| 216 | + |
| 217 | + if (hashingReadsets.size() > 1) |
| 218 | + { |
| 219 | + throw new PipelineJobException("More than one hashing readset associated with GEX readset: " + loupeFile.getReadset()); |
| 220 | + } |
| 221 | + else if (hashingReadsets.size() == 1) |
| 222 | + { |
| 223 | + writeExtraData(loupeFile.getRowid(), hashingReadsets.iterator().next(), job, "Cell Hashing Counts", writer, "HTO"); |
| 224 | + } |
| 225 | + |
| 226 | + if (citeReadsets.size() > 1) |
| 227 | + { |
| 228 | + throw new PipelineJobException("More than one CITE-seq readset associated with GEX readset: " + loupeFile.getReadset()); |
| 229 | + } |
| 230 | + else if (citeReadsets.size() == 1) |
| 231 | + { |
| 232 | + writeExtraData(loupeFile.getRowid(), citeReadsets.iterator().next(), job, "CITE-seq Counts", writer, "ADT"); |
| 233 | + } |
| 234 | + } |
| 235 | + |
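| | + // Finds the most recent output of the given category for the readset and writes its molecule_info.h5 under the supplied assay name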
| 236 | + private void writeExtraData(int datasetId, int readsetId, PipelineJob job, String category, CSVWriter writer, String assayName) throws PipelineJobException |
| 237 | + { |
| 238 | + Container targetContainer = job.getContainer().isWorkbook() ? job.getContainer().getParent() : job.getContainer(); |
| 239 | + SimpleFilter filter = new SimpleFilter(FieldKey.fromString("readsetId"), readsetId); |
| 240 | + filter.addCondition(FieldKey.fromString("category"), category); |
| 241 | + |
| 242 | + List<Integer> rowIds = new TableSelector(QueryService.get().getUserSchema(job.getUser(), targetContainer, SingleCellSchema.SEQUENCE_SCHEMA_NAME).getTable("outputfiles"), PageFlowUtil.set("rowid"), filter, new Sort("-rowid")).getArrayList(Integer.class); |
| 243 | + if (!rowIds.isEmpty()) |
| 244 | + { |
| 245 | + if (rowIds.size() > 1) |
| 246 | + { |
| 247 | + job.getLogger().info("More than one " + assayName + " output found for " + readsetId + ", using the most recent: " + rowIds.get(0)); |
| 248 | + } |
| 249 | + |
| 250 | + File molInfo = new File(SequenceOutputFile.getForId(rowIds.get(0)).getFile().getParentFile().getParentFile(), "molecule_info.h5"); |
| 251 | + if (!molInfo.exists()) |
| 252 | + { |
| 253 | + throw new PipelineJobException("Cannot find file: " + molInfo.getPath()); |
| 254 | + } |
| 255 | + |
| 256 | + writer.writeNext(new String[]{String.valueOf(datasetId), molInfo.getPath(), assayName}); |
| 257 | + } |
| 258 | + } |
131 | 259 | } |