Skip to content

Commit ac98c98

Browse files
Merge 23.6 to develop
2 parents (ede4593 + e269579), commit ac98c98

1 file changed

Lines changed: 97 additions & 70 deletions

File tree

genotyping/src/org/labkey/genotyping/IlluminaFastqParser.java

Lines changed: 97 additions & 70 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@
3535
import org.labkey.api.pipeline.PipelineJob;
3636
import org.labkey.api.pipeline.PipelineJobException;
3737
import org.labkey.api.test.TestWhen;
38+
import org.labkey.api.util.FileUtil;
3839
import org.labkey.api.util.JunitUtil;
3940
import org.labkey.api.util.Pair;
4041
import org.labkey.api.util.TestContext;
@@ -117,97 +118,123 @@ public Map<Pair<Integer, Integer>, FileInfo> parseFastqFiles(PipelineJob job) th
117118
int index = 1;
118119
for (File f : _files)
119120
{
120-
if(job != null)
121+
if (job != null)
121122
job.setStatus("PARSING FILE " + index + " OF " + _files.size());
122123

123-
if(f.length() == 0)
124+
long length = f.length();
125+
if (length == 0)
124126
{
125127
_logger.info("File " + f.getName() + " has no content to parse.");
126128
continue;
127129
}
128-
_logger.info("Beginning to parse file: " + f.getName());
129-
try (FastqReader reader = new FastqReader(f))
130+
131+
File tempFile = null;
132+
133+
try
130134
{
131-
File targetDir = f.getParentFile();
132-
String fileName = f.getName();
133-
134-
int sampleIdx = Integer.MIN_VALUE;
135-
String sampleName = null;
136-
int pairNumber = Integer.MIN_VALUE;
137-
int totalReads = 0;
138-
while (reader.hasNext())
135+
// Copy to a temp file for parsing for perf reasons. See issue 48029
136+
// Ideally we'd avoid the copy when the file is already on a local file system, but there's no
137+
// good way to check if a file is truly local
138+
tempFile = FileUtil.createTempFile(FileUtil.getBaseName(f) + ".", "." + FileUtil.getExtension(f));
139+
tempFile.deleteOnExit();
140+
_logger.debug("Copying to temp file " + tempFile + ", size is " + f.length() + " bytes");
141+
FileUtil.copyFile(f, tempFile);
142+
143+
_logger.info("Beginning to parse file: " + f.getName());
144+
try (FastqReader reader = new FastqReader(tempFile))
139145
{
140-
FastqRecord fq = reader.next();
141-
String header = fq.getReadName();
142-
IlluminaReadHeader parsedHeader = new IlluminaReadHeader(header, fileName);
143-
if(parsedHeader.getSampleName() != null) // may be new header format, so let's try alternate lookup
146+
File targetDir = f.getParentFile();
147+
String fileName = f.getName();
148+
149+
int sampleIdx = Integer.MIN_VALUE;
150+
String sampleName = null;
151+
int pairNumber = Integer.MIN_VALUE;
152+
int totalReads = 0;
153+
while (reader.hasNext())
144154
{
145-
sampleName = parsedHeader.getSampleName();
146-
147-
// First try to resolve as a sample name
148-
Integer sampleId = _sampleNameToIdMap.get(parsedHeader.getSampleName());
149-
if (sampleId == null)
155+
FastqRecord fq = reader.next();
156+
String header = fq.getReadName();
157+
IlluminaReadHeader parsedHeader = new IlluminaReadHeader(header, fileName);
158+
if (parsedHeader.getSampleName() != null) // may be new header format, so let's try alternate lookup
150159
{
151-
try
152-
{
153-
sampleId = Integer.parseInt(parsedHeader.getSampleName());
154-
}
155-
catch(NumberFormatException e)
156-
{
157-
throw new PipelineJobException("Could not resolve sample ID for sample named '" + parsedHeader.getSampleName() + "'. Sample map is: " + _sampleNameToIdMap);
158-
}
159-
Integer sampleIndex = _sampleIdToIndexMap.get(sampleId);
160-
if (sampleIndex == null)
160+
sampleName = parsedHeader.getSampleName();
161+
162+
// First try to resolve as a sample name
163+
Integer sampleId = _sampleNameToIdMap.get(parsedHeader.getSampleName());
164+
if (sampleId == null)
161165
{
162-
throw new PipelineJobException("Could not resolve Sample Index for Sample ID: " + sampleId + ". Id to Index mapping is: " + _sampleIdToIndexMap);
166+
try
167+
{
168+
sampleId = Integer.parseInt(parsedHeader.getSampleName());
169+
}
170+
catch (NumberFormatException e)
171+
{
172+
throw new PipelineJobException("Could not resolve sample ID for sample named '" + parsedHeader.getSampleName() + "'. Sample map is: " + _sampleNameToIdMap);
173+
}
174+
Integer sampleIndex = _sampleIdToIndexMap.get(sampleId);
175+
if (sampleIndex == null)
176+
{
177+
throw new PipelineJobException("Could not resolve Sample Index for Sample ID: " + sampleId + ". Id to Index mapping is: " + _sampleIdToIndexMap);
178+
}
179+
parsedHeader.setSampleNum(sampleIndex.intValue());
163180
}
164-
parsedHeader.setSampleNum(sampleIndex.intValue());
165181
}
182+
if ((sampleIdx != Integer.MIN_VALUE && sampleIdx != parsedHeader.getSampleNum()) ||
183+
(pairNumber != Integer.MIN_VALUE && pairNumber != parsedHeader.getPairNumber()))
184+
throw new IllegalStateException("Only one sample ID is allowed per fastq file.");
185+
sampleIdx = parsedHeader.getSampleNum();
186+
pairNumber = parsedHeader.getPairNumber();
187+
totalReads++;
166188
}
167-
if ((sampleIdx != Integer.MIN_VALUE && sampleIdx != parsedHeader.getSampleNum()) ||
168-
(pairNumber != Integer.MIN_VALUE && pairNumber != parsedHeader.getPairNumber()))
169-
throw new IllegalStateException("Only one sample ID is allowed per fastq file.");
170-
sampleIdx = parsedHeader.getSampleNum();
171-
pairNumber = parsedHeader.getPairNumber();
172-
totalReads++;
173-
}
174189

175-
String error = addToPairingInfoMap(fileName, fileNameWithoutPairingInfoMap, totalReads);
176-
if(null != error)
177-
{
178-
_logger.error(error);
179-
reader.close();
180-
throw new PipelineJobException();
181-
}
182-
else if(reader.getLineNumber() == 1 && totalReads == 0 && !f.getName().contains("null"))//empty file
183-
{
184-
_logger.warn("File " + fileName + " has no content to parse.");
185-
reader.close();
186-
continue;
187-
}
188-
else
189-
{
190-
reader.close();
191-
Integer sampleId = _sampleIndexToIdMap.get(sampleIdx);
192-
if (sampleIdx != 0 && sampleId == null && sampleName == null)
190+
String error = addToPairingInfoMap(fileName, fileNameWithoutPairingInfoMap, totalReads);
191+
if (null != error)
193192
{
194-
throw new PipelineJobException("Could not resolve id for sample at index " + sampleIdx + ". Sample map is: " + _sampleIndexToIdMap);
193+
_logger.error(error);
194+
reader.close();
195+
throw new PipelineJobException();
195196
}
196-
if (sampleId == null && sampleName != null)
197+
else if (reader.getLineNumber() == 1 && totalReads == 0 && !f.getName().contains("null"))//empty file
197198
{
198-
sampleId = _sampleNameToIdMap.get(sampleName);
199+
_logger.warn("File " + fileName + " has no content to parse.");
200+
reader.close();
201+
continue;
199202
}
200-
String name = (_outputPrefix == null ? "Reads" : _outputPrefix) + "-R" + pairNumber + "-" + (sampleIdx == 0 ? "Control" : sampleId) + ".fastq.gz";
201-
File newFile = new File(targetDir, name);
202-
203-
if (!f.equals(newFile))
203+
else
204204
{
205-
filesToMove.put(f, newFile);
206-
}
207-
Pair<Integer, Integer> key = Pair.of(sampleId, pairNumber);
208-
_fileInfo.put(key, new FileInfo(newFile, totalReads));
205+
reader.close();
206+
Integer sampleId = _sampleIndexToIdMap.get(sampleIdx);
207+
if (sampleIdx != 0 && sampleId == null && sampleName == null)
208+
{
209+
throw new PipelineJobException("Could not resolve id for sample at index " + sampleIdx + ". Sample map is: " + _sampleIndexToIdMap);
210+
}
211+
if (sampleId == null && sampleName != null)
212+
{
213+
sampleId = _sampleNameToIdMap.get(sampleName);
214+
}
215+
String name = (_outputPrefix == null ? "Reads" : _outputPrefix) + "-R" + pairNumber + "-" + (sampleIdx == 0 ? "Control" : sampleId) + ".fastq.gz";
216+
File newFile = new File(targetDir, name);
217+
218+
if (!f.equals(newFile))
219+
{
220+
filesToMove.put(f, newFile);
221+
}
222+
Pair<Integer, Integer> key = Pair.of(sampleId, pairNumber);
223+
_fileInfo.put(key, new FileInfo(newFile, totalReads));
209224

210-
_logger.info("Finished parsing file: " + f.getName());
225+
_logger.info("Finished parsing file: " + f.getName());
226+
}
227+
}
228+
}
229+
catch (IOException e)
230+
{
231+
throw new PipelineJobException(e);
232+
}
233+
finally
234+
{
235+
if (tempFile != null)
236+
{
237+
tempFile.delete();
211238
}
212239
}
213240
index++;

0 commit comments

Comments (0)