4545
4646public class SpecLibValidator extends SpecLibValidation <ValidatorSkylineDocSpecLib >
4747{
// Lower-case file extensions (without the dot) that getUploadedRawFileBaseName() treats as raw data files
// when an uploaded file is matched against a library source by base name only.
48+ private static final List <String > RAW_FILE_TYPES = List .of ("raw" , "wiff" , "lcd" , "d" , "mzxml" , "mzml" );
// Lower-case extension (without the dot) used to recognize DIA-NN TSV report files.
49+ private static final String TSV = "tsv" ;
// Lower-case extension (without the dot) used to recognize DIA-NN Parquet report files.
50+ private static final String PARQUET = "parquet" ;
51+
4852 private List <ValidatorSkylineDocSpecLib > _docsWithLibrary ;
4953 private SpecLibKeyWithSize _key ;
5054 private SpecLibInfo _specLibInfo ;
@@ -188,20 +192,22 @@ private static List<LibSourceFile> getLibSources(SpecLibReader libReader, ISpect
188192 }
189193 else if (sourceFiles .stream ().anyMatch (LibSourceFile ::isDiannSearch ))
190194 {
191- // Building a library with DIA-NN results in Skyline requires a .speclib file and a report TSV file.
192- // The .blib file includes the name of .speclib but not the name of the report TSV file.
193- // Building a library without the TSV gives this error message in Skyline:
194- // "...the TSV report is required to read speclib files and must be in the same directory as the speclib
195- // and share some leading characters (e.g. somedata-tsv.speclib and somedata-report.tsv )..."
195+ // Building a library with DIA-NN results in Skyline requires a .speclib file and a report file (.parquet or .tsv) .
196+ // The .blib file includes the name of .speclib but not the name of the report file.
197+ // Building a library without the report file gives this error message in Skyline:
198+ // "...the Parquet or TSV report is required to read speclib files and must be in the same directory as the speclib
199+ // and share some leading characters (e.g. somedata-tsv.speclib and somedata-report.parquet )..."
196200
197201 // At some point Skyline may start including the names of all source files in the .blib SQLite file,
198- // so first check if any TSV files were listed as sources in the .blib
199- boolean hasTsvFiles = sourceFiles .stream ()
200- .anyMatch (file -> file .hasIdFile () && file .getIdFile ().toLowerCase ().endsWith (".tsv" ));
201- if (!hasTsvFiles )
202+ // so first check if any Parquet or TSV files were listed as sources in the .blib
203+ boolean hasReportFiles = sourceFiles .stream ()
204+ .anyMatch (file -> file .hasIdFile ()
205+ && (TSV .equals (FileUtil .getExtension (file .getIdFile ().toLowerCase ()))
206+ || PARQUET .equals (FileUtil .getExtension (file .getIdFile ().toLowerCase ()))));
207+ if (!hasReportFiles )
202208 {
203- // If there is no TSV source listed in the .blib, then add a placeholder for the DIA-NN report file.
204- sourceFiles .add (new LibSourceFile (null , LibSourceFile .DIANN_REPORT_TSV_PLACEHOLDER , null ));
209+ // If there is no Parquet or TSV source listed in the .blib, then add a placeholder for the DIA-NN report file.
210+ sourceFiles .add (new LibSourceFile (null , LibSourceFile .DIANN_REPORT_PLACEHOLDER , null ));
205211 }
206212 }
207213
@@ -254,7 +260,10 @@ private void validateLibrarySources(List<LibSourceFile> sources, FileContentServ
254260 if (source .hasSpectrumSourceFile () && !checkedFiles .contains (ssf ))
255261 {
256262 checkedFiles .add (ssf );
257- Path path = getPath (ssf , rawFilesDirPaths , source .isMaxQuantSearch (), fcs );
263+ // Libraries built with MaxQuant or DIA-NN v2.0 results may only have the base raw file names (without extension)
264+ // stored in the BLIB. If the library source is either MaxQuant or DIA-NN we will compare with base file names of valid raw files.
265+ boolean allowBaseName = source .isMaxQuantSearch () || source .isDiannSearch ();
266+ Path path = getPath (ssf , rawFilesDirPaths , allowBaseName );
258267 SpecLibSourceFile sourceFile = new SpecLibSourceFile (ssf , SPECTRUM );
259268 sourceFile .setSpecLibValidationId (getId ());
260269 sourceFile .setPath (path != null ? path .toString () : DataFile .NOT_FOUND );
@@ -263,24 +272,24 @@ private void validateLibrarySources(List<LibSourceFile> sources, FileContentServ
263272 String idFile = source .getIdFile ();
264273 if (source .hasIdFile () && !checkedFiles .contains (idFile ))
265274 {
266- if (LibSourceFile .DIANN_REPORT_TSV_PLACEHOLDER .equals (idFile )) continue ; // We will look for this when we come to the .speclib file
275+ if (LibSourceFile .DIANN_REPORT_PLACEHOLDER .equals (idFile )) continue ; // We will look for this when we come to the .speclib file
267276
268277 checkedFiles .add (idFile );
269- Path path = getPath (idFile , rawFilesDirPaths , false , fcs );
278+ Path path = getPath (idFile , rawFilesDirPaths , false );
270279 SpecLibSourceFile sourceFile = new SpecLibSourceFile (idFile , PEPTIDE_ID );
271280 sourceFile .setSpecLibValidationId (getId ());
272281 sourceFile .setPath (path != null ? path .toString () : DataFile .NOT_FOUND );
273282 idFiles .add (sourceFile );
274283
275284 if (source .isDiannSearch ())
276285 {
277- // If this is a DIA-NN .speclib file, check for the required report TSV file.
278- // We are doing this because the .blib does not include the name of the report TSV file.
279- // We only know that: "the TSV report is required to read speclib files and must be in the
286+ // If this is a DIA-NN .speclib file, check for the required report file (Parquet or TSV) .
287+ // We are doing this because the .blib does not include the name of the report file.
288+ // We only know that: "the Parquet or TSV report is required to read speclib files and must be in the
280289 // same directory as the speclib and share some leading characters
281- // (e.g. somedata-tsv.speclib and somedata-report.tsv )"
290+ // (e.g. somedata-tsv.speclib and somedata-report.parquet )"
282291 Path reportFilePath = sourceFile .found () ? getDiannReportFilePath (path ) : null ;
283- SpecLibSourceFile diannReportSourceFile = new SpecLibSourceFile (LibSourceFile .DIANN_REPORT_TSV_PLACEHOLDER , PEPTIDE_ID );
292+ SpecLibSourceFile diannReportSourceFile = new SpecLibSourceFile (LibSourceFile .DIANN_REPORT_PLACEHOLDER , PEPTIDE_ID );
284293 diannReportSourceFile .setSpecLibValidationId (getId ());
285294 diannReportSourceFile .setPath (reportFilePath != null ? reportFilePath .toString () : DataFile .NOT_FOUND );
286295 idFiles .add (diannReportSourceFile );
@@ -292,11 +301,11 @@ private void validateLibrarySources(List<LibSourceFile> sources, FileContentServ
292301 setIdFiles (idFiles );
293302 }
294303
295- private Path getPath (String name , Set <Path > rawFilesDirPaths , boolean isMaxquant , FileContentService fcs )
304+ private Path getPath (String name , Set <Path > rawFilesDirPaths , boolean allowBaseName )
296305 {
297306 for (Path rawFilesDir : rawFilesDirPaths )
298307 {
299- Path path = findInDirectoryTree (rawFilesDir , name , isMaxquant );
308+ Path path = findInDirectoryTree (rawFilesDir , name , allowBaseName );
300309 if (path != null )
301310 {
302311 return path ;
@@ -321,7 +330,19 @@ private static Path getDiannReportFilePath(Path speclibFilePath)
321330
322331 private static Path getDiannReportFilePath (String specLibFileName , List <Path > candidateFiles )
323332 {
324- Map <Path , Integer > prefixLengthMap = getCommonPrefixLengthsForTsvFiles (candidateFiles , specLibFileName );
333+ // First look for a matching Parquet file
334+ Map <Path , Integer > prefixLengthMap = getCommonPrefixLengthsForParquetFiles (candidateFiles , specLibFileName );
335+ // Find the Parquet file with the longest common prefix
336+ Path parquetFile = prefixLengthMap .entrySet ().stream ()
337+ .sorted ((entry1 , entry2 ) -> Integer .compare (entry2 .getValue (), entry1 .getValue ())) // Sort descending by matching prefix length
338+ .map (Map .Entry ::getKey ) // File paths
339+ .findFirst () // Get the first file that meets the conditions
340+ .orElse (null );
341+ if (parquetFile != null ) return parquetFile ;
342+
343+
344+ // Look for a matching TSV file if we did not find a Parquet file
345+ prefixLengthMap = getCommonPrefixLengthsForTsvFiles (candidateFiles , specLibFileName );
325346
326347 // Find the TSV file with the longest common prefix that also has the expected column headers in the first line
327348 return prefixLengthMap .entrySet ().stream ()
@@ -332,12 +353,12 @@ private static Path getDiannReportFilePath(String specLibFileName, List<Path> ca
332353 .orElse (null );
333354 }
334355
335- private static Map <Path , Integer > getCommonPrefixLengthsForTsvFiles (List <Path > files , String specLibFileName )
356+ private static Map <Path , Integer > getCommonPrefixLengths (List <Path > files , String specLibFileName , String fileExtension )
336357 {
337358 String specLibFileBaseName = FileUtil .getBaseName (specLibFileName ); // Remove file extension
338359 Map <Path , Integer > prefixLengthMap = new HashMap <>();
339360 files .stream ()
340- .filter (file -> file .getFileName ().toString ().toLowerCase (). endsWith ( ".tsv" )) // Ensure it's a TSV file
361+ .filter (file -> fileExtension . equals ( FileUtil . getExtension ( file .getFileName ().toString ().toLowerCase ())))
341362 .forEach (file -> {
342363 // Get the longest common prefix length
343364 int commonPrefixLength = commonPrefixLength (specLibFileBaseName , FileUtil .getBaseName (file .getFileName ().toString ()));
@@ -350,6 +371,16 @@ private static Map<Path, Integer> getCommonPrefixLengthsForTsvFiles(List<Path> f
350371 return prefixLengthMap ;
351372 }
352373
374+ private static Map <Path , Integer > getCommonPrefixLengthsForTsvFiles (List <Path > files , String specLibFileName )
375+ {
376+ return getCommonPrefixLengths (files , specLibFileName , TSV );
377+ }
378+
379+ private static Map <Path , Integer > getCommonPrefixLengthsForParquetFiles (List <Path > files , String specLibFileName )
380+ {
381+ return getCommonPrefixLengths (files , specLibFileName , PARQUET );
382+ }
383+
353384 private static int commonPrefixLength (String s1 , String s2 )
354385 {
355386 int maxLength = Math .min (s1 .length (), s2 .length ());
@@ -418,7 +449,7 @@ private Path findInDirectoryTree(java.nio.file.Path rawFilesDirPath, String file
418449 return filePath ;
419450 }
420451
421- // Look for zip files
452+ // Look for zip files, or raw files with matching base names if we are allowing basename matching.
422453 try (Stream <Path > list = Files .list (rawFilesDirPath ).filter (p -> FileUtil .getFileName (p ).startsWith (fileName )))
423454 {
424455 for (Path path : list .collect (Collectors .toList ()))
@@ -438,14 +469,27 @@ private static boolean accept(String fileName, String uploadedFileName)
438469 return accept (fileName , uploadedFileName , false );
439470 }
440471
441- private static boolean accept (String fileName , String uploadedFileName , boolean allowBasenameOnly )
472+ private static boolean accept (String fileName , String uploadedFileName , boolean allowBaseName )
442473 {
443474 // Accept QC_10.9.17.raw OR for QC_10.9.17.raw.zip
444475 // 170428_DBS_cal_7a.d OR 170428_DBS_cal_7a.d.zip
445- String ext = FileUtil .getExtension (uploadedFileName ).toLowerCase ();
476+ // If allowBaseName is set to true, accept
477+ // B_240207_IO5x75_HeLa_400ng.raw (or another valid raw file extension) for B_240207_IO5x75_HeLa_400ng
478+ String ext = FileUtil .getExtension (uploadedFileName );
479+ ext = ext != null ? ext .toLowerCase () : "" ;
446480 return fileName .equals (uploadedFileName )
447481 || ext .equals ("zip" ) && fileName .equals (FileUtil .getBaseName (uploadedFileName ))
448- || (allowBasenameOnly && fileName .equals (FileUtil .getBaseName (uploadedFileName )));
482+ || (allowBaseName && fileName .equals (getUploadedRawFileBaseName (uploadedFileName )));
483+ }
484+
485+ private static String getUploadedRawFileBaseName (String uploadedFileName )
486+ {
487+ String ext = FileUtil .getExtension (uploadedFileName .toLowerCase ());
488+ if (!RAW_FILE_TYPES .stream ().anyMatch (type -> type .equals (ext )))
489+ {
490+ return null ;
491+ }
492+ return FileUtil .getBaseName (uploadedFileName );
449493 }
450494
451495 public static class SpecLibKeyWithSize
@@ -567,6 +611,10 @@ public void testAccept()
567611 // Accept 170428_DBS_cal_7a.d OR 170428_DBS_cal_7a.d.zip
568612 assertTrue (accept ("170428_DBS_cal_7a.d" , "170428_DBS_cal_7a.d" ));
569613 assertTrue (accept ("170428_DBS_cal_7a.d" , "170428_DBS_cal_7a.d.zip" ));
614+
615+ assertFalse (accept ("B_240207_IO5x75_HeLa_400ng" , "B_240207_IO5x75_HeLa_400ng.raw" ));
616+ assertTrue (accept ("B_240207_IO5x75_HeLa_400ng" , "B_240207_IO5x75_HeLa_400ng.raw" , true ));
617+ assertFalse (accept ("B_240207_IO5x75_HeLa_400ng" , "B_240207_IO5x75_HeLa_400ng.txt" , true ));
570618 }
571619
572620 @ Test
0 commit comments