Skip to content

Commit 3745456

Browse files
authored
Support for validating libraries built with DIA-NN v2.0 results (#543)
- Added support for BLIB files built with DIA-NN v2.0 results
- Report file is a Parquet file
- Since file extensions are not included in the raw file names, allow base name matching with valid raw files
- Updated test, and added test files
1 parent ae00ef4 commit 3745456

File tree

14 files changed

+162
-37
lines changed

14 files changed

+162
-37
lines changed

panoramapublic/src/org/labkey/panoramapublic/model/validation/SpecLibSourceFile.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,9 +88,9 @@ public int hashCode()
8888
public JSONObject toJSON(Container container)
8989
{
9090
JSONObject jsonObject = super.toJSON(container);
91-
if (isIdFile() && LibSourceFile.DIANN_REPORT_TSV_PLACEHOLDER.equals(getName()) && !found())
91+
if (isIdFile() && LibSourceFile.DIANN_REPORT_PLACEHOLDER.equals(getName()) && !found())
9292
{
93-
jsonObject.put("statusDetails", "The DIA-NN TSV report must be in the same directory as the " +
93+
jsonObject.put("statusDetails", "The DIA-NN report file (.parquet or .tsv) must be in the same directory as the " +
9494
".speclib, and share some leading characters in the file name");
9595
}
9696
return jsonObject;

panoramapublic/src/org/labkey/panoramapublic/proteomexchange/validator/SpecLibValidator.java

Lines changed: 76 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@
4545

4646
public class SpecLibValidator extends SpecLibValidation<ValidatorSkylineDocSpecLib>
4747
{
48+
private static final List<String> RAW_FILE_TYPES = List.of("raw", "wiff", "lcd", "d", "mzxml", "mzml");
49+
private static final String TSV = "tsv";
50+
private static final String PARQUET = "parquet";
51+
4852
private List<ValidatorSkylineDocSpecLib> _docsWithLibrary;
4953
private SpecLibKeyWithSize _key;
5054
private SpecLibInfo _specLibInfo;
@@ -188,20 +192,22 @@ private static List<LibSourceFile> getLibSources(SpecLibReader libReader, ISpect
188192
}
189193
else if (sourceFiles.stream().anyMatch(LibSourceFile::isDiannSearch))
190194
{
191-
// Building a library with DIA-NN results in Skyline requires a .speclib file and a report TSV file.
192-
// The .blib file includes the name of .speclib but not the name of the report TSV file.
193-
// Building a library without the TSV gives this error message in Skyline:
194-
// "...the TSV report is required to read speclib files and must be in the same directory as the speclib
195-
// and share some leading characters (e.g. somedata-tsv.speclib and somedata-report.tsv)..."
195+
// Building a library with DIA-NN results in Skyline requires a .speclib file and a report file (.parquet or .tsv).
196+
// The .blib file includes the name of .speclib but not the name of the report file.
197+
// Building a library without the report file gives this error message in Skyline:
198+
// "...the Parquet or TSV report is required to read speclib files and must be in the same directory as the speclib
199+
// and share some leading characters (e.g. somedata-tsv.speclib and somedata-report.parquet)..."
196200

197201
// At some point Skyline may start including the names of all source files in the .blib SQLite file,
198-
// so first check if any TSV files were listed as sources in the .blib
199-
boolean hasTsvFiles = sourceFiles.stream()
200-
.anyMatch(file -> file.hasIdFile() && file.getIdFile().toLowerCase().endsWith(".tsv"));
201-
if (!hasTsvFiles)
202+
// so first check if any Parquet or TSV files were listed as sources in the .blib
203+
boolean hasReportFiles = sourceFiles.stream()
204+
.anyMatch(file -> file.hasIdFile()
205+
&& (TSV.equals(FileUtil.getExtension(file.getIdFile().toLowerCase()))
206+
|| PARQUET.equals(FileUtil.getExtension(file.getIdFile().toLowerCase()))));
207+
if (!hasReportFiles)
202208
{
203-
// If there is no TSV source listed in the .blib, then add a placeholder for the DIA-NN report file.
204-
sourceFiles.add(new LibSourceFile(null, LibSourceFile.DIANN_REPORT_TSV_PLACEHOLDER, null));
209+
// If there is no Parquet or TSV source listed in the .blib, then add a placeholder for the DIA-NN report file.
210+
sourceFiles.add(new LibSourceFile(null, LibSourceFile.DIANN_REPORT_PLACEHOLDER, null));
205211
}
206212
}
207213

@@ -254,7 +260,10 @@ private void validateLibrarySources(List<LibSourceFile> sources, FileContentServ
254260
if (source.hasSpectrumSourceFile() && !checkedFiles.contains(ssf))
255261
{
256262
checkedFiles.add(ssf);
257-
Path path = getPath(ssf, rawFilesDirPaths, source.isMaxQuantSearch(), fcs);
263+
// Libraries built with MaxQuant or DIA-NN v2.0 results may only have the base raw file names (without extension)
264+
// stored in the BLIB. If the library source is either MaxQuant or DIA-NN we will compare with base file names of valid raw files.
265+
boolean allowBaseName = source.isMaxQuantSearch() || source.isDiannSearch();
266+
Path path = getPath(ssf, rawFilesDirPaths, allowBaseName);
258267
SpecLibSourceFile sourceFile = new SpecLibSourceFile(ssf, SPECTRUM);
259268
sourceFile.setSpecLibValidationId(getId());
260269
sourceFile.setPath(path != null ? path.toString() : DataFile.NOT_FOUND);
@@ -263,24 +272,24 @@ private void validateLibrarySources(List<LibSourceFile> sources, FileContentServ
263272
String idFile = source.getIdFile();
264273
if (source.hasIdFile() && !checkedFiles.contains(idFile))
265274
{
266-
if (LibSourceFile.DIANN_REPORT_TSV_PLACEHOLDER.equals(idFile)) continue; // We will look for this when we come to the .speclib file
275+
if (LibSourceFile.DIANN_REPORT_PLACEHOLDER.equals(idFile)) continue; // We will look for this when we come to the .speclib file
267276

268277
checkedFiles.add(idFile);
269-
Path path = getPath(idFile, rawFilesDirPaths, false, fcs);
278+
Path path = getPath(idFile, rawFilesDirPaths, false);
270279
SpecLibSourceFile sourceFile = new SpecLibSourceFile(idFile, PEPTIDE_ID);
271280
sourceFile.setSpecLibValidationId(getId());
272281
sourceFile.setPath(path != null ? path.toString() : DataFile.NOT_FOUND);
273282
idFiles.add(sourceFile);
274283

275284
if (source.isDiannSearch())
276285
{
277-
// If this is a DIA-NN .speclib file, check for the required report TSV file.
278-
// We are doing this because the .blib does not include the name of the report TSV file.
279-
// We only know that: "the TSV report is required to read speclib files and must be in the
286+
// If this is a DIA-NN .speclib file, check for the required report file (Parquet or TSV).
287+
// We are doing this because the .blib does not include the name of the report file.
288+
// We only know that: "the Parquet or TSV report is required to read speclib files and must be in the
280289
// same directory as the speclib and share some leading characters
281-
// (e.g. somedata-tsv.speclib and somedata-report.tsv)"
290+
// (e.g. somedata-tsv.speclib and somedata-report.parquet)"
282291
Path reportFilePath = sourceFile.found() ? getDiannReportFilePath(path) : null;
283-
SpecLibSourceFile diannReportSourceFile = new SpecLibSourceFile(LibSourceFile.DIANN_REPORT_TSV_PLACEHOLDER, PEPTIDE_ID);
292+
SpecLibSourceFile diannReportSourceFile = new SpecLibSourceFile(LibSourceFile.DIANN_REPORT_PLACEHOLDER, PEPTIDE_ID);
284293
diannReportSourceFile.setSpecLibValidationId(getId());
285294
diannReportSourceFile.setPath(reportFilePath != null ? reportFilePath.toString() : DataFile.NOT_FOUND);
286295
idFiles.add(diannReportSourceFile);
@@ -292,11 +301,11 @@ private void validateLibrarySources(List<LibSourceFile> sources, FileContentServ
292301
setIdFiles(idFiles);
293302
}
294303

295-
private Path getPath(String name, Set<Path> rawFilesDirPaths, boolean isMaxquant, FileContentService fcs)
304+
private Path getPath(String name, Set<Path> rawFilesDirPaths, boolean allowBaseName)
296305
{
297306
for (Path rawFilesDir: rawFilesDirPaths)
298307
{
299-
Path path = findInDirectoryTree(rawFilesDir, name, isMaxquant);
308+
Path path = findInDirectoryTree(rawFilesDir, name, allowBaseName);
300309
if (path != null)
301310
{
302311
return path;
@@ -321,7 +330,19 @@ private static Path getDiannReportFilePath(Path speclibFilePath)
321330

322331
private static Path getDiannReportFilePath(String specLibFileName, List<Path> candidateFiles)
323332
{
324-
Map<Path, Integer> prefixLengthMap = getCommonPrefixLengthsForTsvFiles(candidateFiles, specLibFileName);
333+
// First look for a matching Parquet file
334+
Map<Path, Integer> prefixLengthMap = getCommonPrefixLengthsForParquetFiles(candidateFiles, specLibFileName);
335+
// Find the Parquet file with the longest common prefix
336+
Path parquetFile = prefixLengthMap.entrySet().stream()
337+
.sorted((entry1, entry2) -> Integer.compare(entry2.getValue(), entry1.getValue())) // Sort descending by matching prefix length
338+
.map(Map.Entry::getKey) // File paths
339+
.findFirst() // Get the first file that meets the conditions
340+
.orElse(null);
341+
if (parquetFile != null) return parquetFile;
342+
343+
344+
// Look for a matching TSV file if we did not find a Parquet file
345+
prefixLengthMap = getCommonPrefixLengthsForTsvFiles(candidateFiles, specLibFileName);
325346

326347
// Find the TSV file with the longest common prefix that also has the expected column headers in the first line
327348
return prefixLengthMap.entrySet().stream()
@@ -332,12 +353,12 @@ private static Path getDiannReportFilePath(String specLibFileName, List<Path> ca
332353
.orElse(null);
333354
}
334355

335-
private static Map<Path, Integer> getCommonPrefixLengthsForTsvFiles(List<Path> files, String specLibFileName)
356+
private static Map<Path, Integer> getCommonPrefixLengths(List<Path> files, String specLibFileName, String fileExtension)
336357
{
337358
String specLibFileBaseName = FileUtil.getBaseName(specLibFileName); // Remove file extension
338359
Map<Path, Integer> prefixLengthMap = new HashMap<>();
339360
files.stream()
340-
.filter(file -> file.getFileName().toString().toLowerCase().endsWith(".tsv")) // Ensure it's a TSV file
361+
.filter(file -> fileExtension.equals(FileUtil.getExtension(file.getFileName().toString().toLowerCase())))
341362
.forEach(file -> {
342363
// Get the longest common prefix length
343364
int commonPrefixLength = commonPrefixLength(specLibFileBaseName, FileUtil.getBaseName(file.getFileName().toString()));
@@ -350,6 +371,16 @@ private static Map<Path, Integer> getCommonPrefixLengthsForTsvFiles(List<Path> f
350371
return prefixLengthMap;
351372
}
352373

374+
private static Map<Path, Integer> getCommonPrefixLengthsForTsvFiles(List<Path> files, String specLibFileName)
375+
{
376+
return getCommonPrefixLengths(files, specLibFileName, TSV);
377+
}
378+
379+
private static Map<Path, Integer> getCommonPrefixLengthsForParquetFiles(List<Path> files, String specLibFileName)
380+
{
381+
return getCommonPrefixLengths(files, specLibFileName, PARQUET);
382+
}
383+
353384
private static int commonPrefixLength(String s1, String s2)
354385
{
355386
int maxLength = Math.min(s1.length(), s2.length());
@@ -418,7 +449,7 @@ private Path findInDirectoryTree(java.nio.file.Path rawFilesDirPath, String file
418449
return filePath;
419450
}
420451

421-
// Look for zip files
452+
// Look for zip files, or raw files with matching base names if we are allowing basename matching.
422453
try (Stream<Path> list = Files.list(rawFilesDirPath).filter(p -> FileUtil.getFileName(p).startsWith(fileName)))
423454
{
424455
for (Path path : list.collect(Collectors.toList()))
@@ -438,14 +469,27 @@ private static boolean accept(String fileName, String uploadedFileName)
438469
return accept(fileName, uploadedFileName, false);
439470
}
440471

441-
private static boolean accept(String fileName, String uploadedFileName, boolean allowBasenameOnly)
472+
private static boolean accept(String fileName, String uploadedFileName, boolean allowBaseName)
442473
{
443474
// Accept QC_10.9.17.raw OR for QC_10.9.17.raw.zip
444475
// 170428_DBS_cal_7a.d OR 170428_DBS_cal_7a.d.zip
445-
String ext = FileUtil.getExtension(uploadedFileName).toLowerCase();
476+
// If allowBaseName is set to true, accept
477+
// B_240207_IO5x75_HeLa_400ng.raw (or another valid raw file extension) for B_240207_IO5x75_HeLa_400ng
478+
String ext = FileUtil.getExtension(uploadedFileName);
479+
ext = ext != null ? ext.toLowerCase() : "";
446480
return fileName.equals(uploadedFileName)
447481
|| ext.equals("zip") && fileName.equals(FileUtil.getBaseName(uploadedFileName))
448-
|| (allowBasenameOnly && fileName.equals(FileUtil.getBaseName(uploadedFileName)));
482+
|| (allowBaseName && fileName.equals(getUploadedRawFileBaseName(uploadedFileName)));
483+
}
484+
485+
private static String getUploadedRawFileBaseName(String uploadedFileName)
486+
{
487+
String ext = FileUtil.getExtension(uploadedFileName.toLowerCase());
488+
if (!RAW_FILE_TYPES.stream().anyMatch(type -> type.equals(ext)))
489+
{
490+
return null;
491+
}
492+
return FileUtil.getBaseName(uploadedFileName);
449493
}
450494

451495
public static class SpecLibKeyWithSize
@@ -567,6 +611,10 @@ public void testAccept()
567611
// Accept 170428_DBS_cal_7a.d OR 170428_DBS_cal_7a.d.zip
568612
assertTrue(accept("170428_DBS_cal_7a.d", "170428_DBS_cal_7a.d"));
569613
assertTrue(accept("170428_DBS_cal_7a.d", "170428_DBS_cal_7a.d.zip"));
614+
615+
assertFalse(accept("B_240207_IO5x75_HeLa_400ng", "B_240207_IO5x75_HeLa_400ng.raw"));
616+
assertTrue(accept("B_240207_IO5x75_HeLa_400ng", "B_240207_IO5x75_HeLa_400ng.raw", true));
617+
assertFalse(accept("B_240207_IO5x75_HeLa_400ng", "B_240207_IO5x75_HeLa_400ng.txt", true));
570618
}
571619

572620
@Test

panoramapublic/src/org/labkey/panoramapublic/speclib/LibSourceFile.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ public boolean isMaxQuantSearch()
8282
return (hasIdFile() && getIdFile().endsWith("msms.txt")) || containsScoreType("MAXQUANT SCORE");
8383
}
8484

85-
public static String DIANN_REPORT_TSV_PLACEHOLDER = "DIA-NN report file";
85+
public static String DIANN_REPORT_PLACEHOLDER = "DIA-NN report file";
8686

8787
// These are some of the column headers that we expect to see in a DIA-NN report TSV file
8888
public static List<String> DIANN_REPORT_EXPECTED_HEADERS = List.of("File.Name", "Run", "Protein.Group", "Protein.Ids", "Protein.Names");
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Dummy file for testing.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Dummy file for testing.
Binary file not shown.
Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1-
Files downloaded from https://panoramaweb.org/QuickProt_datasets.url (Ranish lab)
1+
- Files used to build the library test_diann_library.blib were downloaded from https://panoramaweb.org/QuickProt_datasets.url (Ranish lab).
2+
- test_diann_V2_library.blib was built with test Skyline files downloaded from https://github.com/ProteoWizard/pwiz/tree/master/pwiz_tools/BiblioSpec/tests/inputs/diann2-synchro-pasef.
3+
Raw files used for testing validation of test_diann_V2_library.blib are dummy files.
Binary file not shown.

0 commit comments

Comments
 (0)