|
35 | 35 | import org.labkey.api.pipeline.PipelineJob; |
36 | 36 | import org.labkey.api.pipeline.PipelineJobException; |
37 | 37 | import org.labkey.api.test.TestWhen; |
| 38 | +import org.labkey.api.util.FileUtil; |
38 | 39 | import org.labkey.api.util.JunitUtil; |
39 | 40 | import org.labkey.api.util.Pair; |
40 | 41 | import org.labkey.api.util.TestContext; |
@@ -117,97 +118,123 @@ public Map<Pair<Integer, Integer>, FileInfo> parseFastqFiles(PipelineJob job) th |
117 | 118 | int index = 1; |
118 | 119 | for (File f : _files) |
119 | 120 | { |
120 | | - if(job != null) |
| 121 | + if (job != null) |
121 | 122 | job.setStatus("PARSING FILE " + index + " OF " + _files.size()); |
122 | 123 |
|
123 | | - if(f.length() == 0) |
| 124 | + long length = f.length(); |
| 125 | + if (length == 0) |
124 | 126 | { |
125 | 127 | _logger.info("File " + f.getName() + " has no content to parse."); |
126 | 128 | continue; |
127 | 129 | } |
128 | | - _logger.info("Beginning to parse file: " + f.getName()); |
129 | | - try (FastqReader reader = new FastqReader(f)) |
| 130 | + |
| 131 | + File tempFile = null; |
| 132 | + |
| 133 | + try |
130 | 134 | { |
131 | | - File targetDir = f.getParentFile(); |
132 | | - String fileName = f.getName(); |
133 | | - |
134 | | - int sampleIdx = Integer.MIN_VALUE; |
135 | | - String sampleName = null; |
136 | | - int pairNumber = Integer.MIN_VALUE; |
137 | | - int totalReads = 0; |
138 | | - while (reader.hasNext()) |
| 135 | + // Copy to a temp file for parsing for perf reasons. See issue 48029 |
| 136 | + // Ideally we'd avoid the copy when the file is already on a local file system, but there's no |
| 137 | + // good way to check if a file is truly local |
| 138 | + tempFile = FileUtil.createTempFile(FileUtil.getBaseName(f) + ".", "." + FileUtil.getExtension(f)); |
| 139 | + tempFile.deleteOnExit(); |
| 140 | + _logger.debug("Copying to temp file " + tempFile + ", size is " + f.length() + " bytes"); |
| 141 | + FileUtil.copyFile(f, tempFile); |
| 142 | + |
| 143 | + _logger.info("Beginning to parse file: " + f.getName()); |
| 144 | + try (FastqReader reader = new FastqReader(tempFile)) |
139 | 145 | { |
140 | | - FastqRecord fq = reader.next(); |
141 | | - String header = fq.getReadName(); |
142 | | - IlluminaReadHeader parsedHeader = new IlluminaReadHeader(header, fileName); |
143 | | - if(parsedHeader.getSampleName() != null) // may be new header format, so let's try alternate lookup |
| 146 | + File targetDir = f.getParentFile(); |
| 147 | + String fileName = f.getName(); |
| 148 | + |
| 149 | + int sampleIdx = Integer.MIN_VALUE; |
| 150 | + String sampleName = null; |
| 151 | + int pairNumber = Integer.MIN_VALUE; |
| 152 | + int totalReads = 0; |
| 153 | + while (reader.hasNext()) |
144 | 154 | { |
145 | | - sampleName = parsedHeader.getSampleName(); |
146 | | - |
147 | | - // First try to resolve as a sample name |
148 | | - Integer sampleId = _sampleNameToIdMap.get(parsedHeader.getSampleName()); |
149 | | - if (sampleId == null) |
| 155 | + FastqRecord fq = reader.next(); |
| 156 | + String header = fq.getReadName(); |
| 157 | + IlluminaReadHeader parsedHeader = new IlluminaReadHeader(header, fileName); |
| 158 | + if (parsedHeader.getSampleName() != null) // may be new header format, so let's try alternate lookup |
150 | 159 | { |
151 | | - try |
152 | | - { |
153 | | - sampleId = Integer.parseInt(parsedHeader.getSampleName()); |
154 | | - } |
155 | | - catch(NumberFormatException e) |
156 | | - { |
157 | | - throw new PipelineJobException("Could not resolve sample ID for sample named '" + parsedHeader.getSampleName() + "'. Sample map is: " + _sampleNameToIdMap); |
158 | | - } |
159 | | - Integer sampleIndex = _sampleIdToIndexMap.get(sampleId); |
160 | | - if (sampleIndex == null) |
| 160 | + sampleName = parsedHeader.getSampleName(); |
| 161 | + |
| 162 | + // First try to resolve as a sample name |
| 163 | + Integer sampleId = _sampleNameToIdMap.get(parsedHeader.getSampleName()); |
| 164 | + if (sampleId == null) |
161 | 165 | { |
162 | | - throw new PipelineJobException("Could not resolve Sample Index for Sample ID: " + sampleId + ". Id to Index mapping is: " + _sampleIdToIndexMap); |
| 166 | + try |
| 167 | + { |
| 168 | + sampleId = Integer.parseInt(parsedHeader.getSampleName()); |
| 169 | + } |
| 170 | + catch (NumberFormatException e) |
| 171 | + { |
| 172 | + throw new PipelineJobException("Could not resolve sample ID for sample named '" + parsedHeader.getSampleName() + "'. Sample map is: " + _sampleNameToIdMap); |
| 173 | + } |
| 174 | + Integer sampleIndex = _sampleIdToIndexMap.get(sampleId); |
| 175 | + if (sampleIndex == null) |
| 176 | + { |
| 177 | + throw new PipelineJobException("Could not resolve Sample Index for Sample ID: " + sampleId + ". Id to Index mapping is: " + _sampleIdToIndexMap); |
| 178 | + } |
| 179 | + parsedHeader.setSampleNum(sampleIndex.intValue()); |
163 | 180 | } |
164 | | - parsedHeader.setSampleNum(sampleIndex.intValue()); |
165 | 181 | } |
| 182 | + if ((sampleIdx != Integer.MIN_VALUE && sampleIdx != parsedHeader.getSampleNum()) || |
| 183 | + (pairNumber != Integer.MIN_VALUE && pairNumber != parsedHeader.getPairNumber())) |
| 184 | + throw new IllegalStateException("Only one sample ID is allowed per fastq file."); |
| 185 | + sampleIdx = parsedHeader.getSampleNum(); |
| 186 | + pairNumber = parsedHeader.getPairNumber(); |
| 187 | + totalReads++; |
166 | 188 | } |
167 | | - if ((sampleIdx != Integer.MIN_VALUE && sampleIdx != parsedHeader.getSampleNum()) || |
168 | | - (pairNumber != Integer.MIN_VALUE && pairNumber != parsedHeader.getPairNumber())) |
169 | | - throw new IllegalStateException("Only one sample ID is allowed per fastq file."); |
170 | | - sampleIdx = parsedHeader.getSampleNum(); |
171 | | - pairNumber = parsedHeader.getPairNumber(); |
172 | | - totalReads++; |
173 | | - } |
174 | 189 |
|
175 | | - String error = addToPairingInfoMap(fileName, fileNameWithoutPairingInfoMap, totalReads); |
176 | | - if(null != error) |
177 | | - { |
178 | | - _logger.error(error); |
179 | | - reader.close(); |
180 | | - throw new PipelineJobException(); |
181 | | - } |
182 | | - else if(reader.getLineNumber() == 1 && totalReads == 0 && !f.getName().contains("null"))//empty file |
183 | | - { |
184 | | - _logger.warn("File " + fileName + " has no content to parse."); |
185 | | - reader.close(); |
186 | | - continue; |
187 | | - } |
188 | | - else |
189 | | - { |
190 | | - reader.close(); |
191 | | - Integer sampleId = _sampleIndexToIdMap.get(sampleIdx); |
192 | | - if (sampleIdx != 0 && sampleId == null && sampleName == null) |
| 190 | + String error = addToPairingInfoMap(fileName, fileNameWithoutPairingInfoMap, totalReads); |
| 191 | + if (null != error) |
193 | 192 | { |
194 | | - throw new PipelineJobException("Could not resolve id for sample at index " + sampleIdx + ". Sample map is: " + _sampleIndexToIdMap); |
| 193 | + _logger.error(error); |
| 194 | + reader.close(); |
| 195 | + throw new PipelineJobException(); |
195 | 196 | } |
196 | | - if (sampleId == null && sampleName != null) |
| 197 | + else if (reader.getLineNumber() == 1 && totalReads == 0 && !f.getName().contains("null"))//empty file |
197 | 198 | { |
198 | | - sampleId = _sampleNameToIdMap.get(sampleName); |
| 199 | + _logger.warn("File " + fileName + " has no content to parse."); |
| 200 | + reader.close(); |
| 201 | + continue; |
199 | 202 | } |
200 | | - String name = (_outputPrefix == null ? "Reads" : _outputPrefix) + "-R" + pairNumber + "-" + (sampleIdx == 0 ? "Control" : sampleId) + ".fastq.gz"; |
201 | | - File newFile = new File(targetDir, name); |
202 | | - |
203 | | - if (!f.equals(newFile)) |
| 203 | + else |
204 | 204 | { |
205 | | - filesToMove.put(f, newFile); |
206 | | - } |
207 | | - Pair<Integer, Integer> key = Pair.of(sampleId, pairNumber); |
208 | | - _fileInfo.put(key, new FileInfo(newFile, totalReads)); |
| 205 | + reader.close(); |
| 206 | + Integer sampleId = _sampleIndexToIdMap.get(sampleIdx); |
| 207 | + if (sampleIdx != 0 && sampleId == null && sampleName == null) |
| 208 | + { |
| 209 | + throw new PipelineJobException("Could not resolve id for sample at index " + sampleIdx + ". Sample map is: " + _sampleIndexToIdMap); |
| 210 | + } |
| 211 | + if (sampleId == null && sampleName != null) |
| 212 | + { |
| 213 | + sampleId = _sampleNameToIdMap.get(sampleName); |
| 214 | + } |
| 215 | + String name = (_outputPrefix == null ? "Reads" : _outputPrefix) + "-R" + pairNumber + "-" + (sampleIdx == 0 ? "Control" : sampleId) + ".fastq.gz"; |
| 216 | + File newFile = new File(targetDir, name); |
| 217 | + |
| 218 | + if (!f.equals(newFile)) |
| 219 | + { |
| 220 | + filesToMove.put(f, newFile); |
| 221 | + } |
| 222 | + Pair<Integer, Integer> key = Pair.of(sampleId, pairNumber); |
| 223 | + _fileInfo.put(key, new FileInfo(newFile, totalReads)); |
209 | 224 |
|
210 | | - _logger.info("Finished parsing file: " + f.getName()); |
| 225 | + _logger.info("Finished parsing file: " + f.getName()); |
| 226 | + } |
| 227 | + } |
| 228 | + } |
| 229 | + catch (IOException e) |
| 230 | + { |
| 231 | + throw new PipelineJobException(e); |
| 232 | + } |
| 233 | + finally |
| 234 | + { |
| 235 | + if (tempFile != null) |
| 236 | + { |
| 237 | + tempFile.delete(); |
211 | 238 | } |
212 | 239 | } |
213 | 240 | index++; |
|
0 commit comments