77import java .nio .file .Files ;
88import java .nio .file .Path ;
99import java .text .Normalizer ;
10+ import java .util .ArrayList ;
1011import java .util .Collection ;
1112import java .util .HashSet ;
13+ import java .util .List ;
1214import java .util .ResourceBundle ;
1315import java .util .Set ;
1416
1517import org .slf4j .Logger ;
1618import org .slf4j .LoggerFactory ;
1719import org .slf4j .helpers .MessageFormatter ;
1820
21+ import gov .loc .repository .bagit .domain .Manifest ;
22+ import gov .loc .repository .bagit .domain .Version ;
1923import gov .loc .repository .bagit .exceptions .InvalidBagitFileFormatException ;
24+ import gov .loc .repository .bagit .exceptions .MaliciousPathException ;
25+ import gov .loc .repository .bagit .exceptions .UnsupportedAlgorithmException ;
26+ import gov .loc .repository .bagit .hash .StandardBagitAlgorithmNameToSupportedAlgorithmMapping ;
27+ import gov .loc .repository .bagit .reader .ManifestReader ;
2028import gov .loc .repository .bagit .util .PathUtils ;
2129
2230/**
2331 * Part of the BagIt conformance suite.
2432 * This checker checks for various problems related to the manifests in a bag.
2533 */
26- @ SuppressWarnings ({"PMD.UseLocaleWithCaseConversions" })
34+ //TODO refactor to remove PMD warnings!
35+ @ SuppressWarnings ({"PMD.UseLocaleWithCaseConversions" , "PMD.TooManyMethods" , "PMD.GodClass" })
2736public final class ManifestChecker {
2837 private static final Logger logger = LoggerFactory .getLogger (ManifestChecker .class );
2938 private static final ResourceBundle messages = ResourceBundle .getBundle ("MessageBundle" );
@@ -34,46 +43,79 @@ public final class ManifestChecker {
3443 private static final String TRASHES_FILE = "\\ .(_.)?[Tt][Rr][Aa][Ss][Hh][Ee][Ss]" ;
3544 private static final String FS_EVENTS_FILE = "\\ .[Ff][Ss][Ee][Vv][Ee][Nn][Tt][Ss][Dd]" ;
3645 private static final String OS_FILES_REGEX = ".*data/(" + THUMBS_DB_FILE + "|" + DS_STORE_FILE + "|" + SPOTLIGHT_FILE + "|" + TRASHES_FILE + "|" + FS_EVENTS_FILE + ")" ;
46+ private static final Version VERSION_1_0 = new Version (1 ,0 );
3747
/*
 * Private so the class cannot be instantiated from outside; there is nothing to initialize.
 */
private ManifestChecker() {
  // no state to set up
}
4151
42- /*
52+ /**
4353 * Check for all the manifest specific potential problems
54+ *
55+ * @param version the version of the bag we are checking
56+ * @param bagitDir the directory where the manifests are stored
57+ * @param encoding the encoding of the manifests
58+ * @param warnings the set of warnings that will be appended to while checking
59+ * @param warningsToIgnore the set of warnings to ignore
60+ *
61+ * @throws IOException if there is a problem reading a file (because it doesn't exist)
62+ * @throws InvalidBagitFileFormatException if one (or more) of the files does not match the formatting as specified in the specification
63+ * @throws MaliciousPathException if someone crafted the bag to specifically try and write outside the bag directory
64+ * @throws UnsupportedAlgorithmException if a manifest uses an algorithm that the computer doesn't know how to use
4465 */
45- public static void checkManifests (final Path bagitDir , final Charset encoding , final Set <BagitWarning > warnings ,
46- final Collection <BagitWarning > warningsToIgnore ) throws IOException , InvalidBagitFileFormatException {
66+ //@SuppressWarnings("PMD.CyclomaticComplexity")
67+ public static void checkManifests (final Version version , final Path bagitDir , final Charset encoding , final Set <BagitWarning > warnings ,
68+ final Collection <BagitWarning > warningsToIgnore ) throws IOException , InvalidBagitFileFormatException , MaliciousPathException , UnsupportedAlgorithmException {
4769
4870 boolean missingTagManifest = true ;
71+ final List <Path > payloadManifests = new ArrayList <>();
72+ final List <Path > tagManifests = new ArrayList <>();
4973 try (final DirectoryStream <Path > files = Files .newDirectoryStream (bagitDir )){
5074 for (final Path file : files ){
51- final String filename = PathUtils .getFilename (file );
52- if (filename .contains ("manifest-" )){
53- if (filename .startsWith ("manifest-" )){
54- checkData (file , encoding , warnings , warningsToIgnore , true );
55- }
56- else {
57- checkData (file , encoding , warnings , warningsToIgnore , false );
58- missingTagManifest = false ;
59- }
60-
61- final String algorithm = filename .split ("[-\\ .]" )[1 ];
62- checkAlgorthm (algorithm , warnings , warningsToIgnore );
63- }
75+ missingTagManifest = missingTagManifest && checkManifest (file , payloadManifests , tagManifests , encoding , warnings , warningsToIgnore );
6476 }
6577 }
6678
79+ if (!warnings .contains (BagitWarning .MANIFEST_SETS_DIFFER )){
80+ checkManifestSets (version , tagManifests , payloadManifests , warnings , encoding );
81+ }
82+
6783 if (!warningsToIgnore .contains (BagitWarning .MISSING_TAG_MANIFEST ) && missingTagManifest ){
6884 logger .warn (messages .getString ("bag_missing_tag_manifest_warning" ), bagitDir );
6985 warnings .add (BagitWarning .MISSING_TAG_MANIFEST );
7086 }
7187 }
7288
89+ private static boolean checkManifest (final Path file , final List <Path > payloadManifests , final List <Path > tagManifests ,
90+ final Charset encoding , final Set <BagitWarning > warnings ,
91+ final Collection <BagitWarning > warningsToIgnore ) throws IOException , InvalidBagitFileFormatException {
92+ boolean missingTagManifest = true ;
93+ final String filename = PathUtils .getFilename (file );
94+ if (filename .contains ("manifest-" )){
95+ if (filename .startsWith ("manifest-" )){
96+ payloadManifests .add (file );
97+ checkManifestPayload (file , encoding , warnings , warningsToIgnore , true );
98+ }
99+ else {
100+ tagManifests .add (file );
101+ checkManifestPayload (file , encoding , warnings , warningsToIgnore , false );
102+ missingTagManifest = false ;
103+ }
104+
105+ final String algorithm = filename .split ("[-\\ .]" )[1 ];
106+ checkAlgorthm (algorithm , warnings , warningsToIgnore );
107+ }
108+
109+ return missingTagManifest ;
110+ }
111+
73112 /*
74- * Check for a "bag within a bag" and for relative paths in the manifests
113+ * Check for a "bag within a bag", relative paths, and OS specific files in the manifests
75114 */
76- private static void checkData (final Path manifestFile , final Charset encoding , final Set <BagitWarning > warnings , final Collection <BagitWarning > warningsToIgnore , final boolean isPayloadManifest ) throws IOException , InvalidBagitFileFormatException {
115+ private static void checkManifestPayload (final Path manifestFile , final Charset encoding , final Set <BagitWarning > warnings ,
116+ final Collection <BagitWarning > warningsToIgnore , final boolean isPayloadManifest )
117+ throws IOException , InvalidBagitFileFormatException {
118+
77119 try (final BufferedReader reader = Files .newBufferedReader (manifestFile , encoding )){
78120 final Set <String > paths = new HashSet <>();
79121
@@ -82,28 +124,24 @@ private static void checkData(final Path manifestFile, final Charset encoding, f
82124 String path = parsePath (line );
83125
84126 path = checkForManifestCreatedWithMD5SumTools (path , warnings , warningsToIgnore );
85-
86- if (!warningsToIgnore .contains (BagitWarning .DIFFERENT_CASE ) && paths .contains (path .toLowerCase ())){
87- logger .warn (messages .getString ("different_case_warning" ), manifestFile , path );
88- warnings .add (BagitWarning .DIFFERENT_CASE );
89- }
90127 paths .add (path .toLowerCase ());
91128
129+ checkForDifferentCase (path , paths , manifestFile , warnings , warningsToIgnore );
92130 if (encoding .name ().startsWith ("UTF" )){
93131 checkNormalization (path , manifestFile .getParent (), warnings , warningsToIgnore );
94132 }
95-
96133 checkForBagWithinBag (line , warnings , warningsToIgnore , isPayloadManifest );
97-
98134 checkForRelativePaths (line , warnings , warningsToIgnore , manifestFile );
99-
100135 checkForOSSpecificFiles (line , warnings , warningsToIgnore , manifestFile );
101136
102137 line = reader .readLine ();
103138 }
104139 }
105140 }
106141
142+ /*
143+ * Check to make sure it conforms to <hash> <path>
144+ */
107145 static String parsePath (final String line ) throws InvalidBagitFileFormatException {
108146 final String [] parts = line .split ("\\ s+" , 2 );
109147 if (parts .length < 2 ){
@@ -114,6 +152,9 @@ static String parsePath(final String line) throws InvalidBagitFileFormatExceptio
114152 return parts [1 ];
115153 }
116154
155+ /*
156+ * We allow for MD5sum tools for compatibility but it is not recommended
157+ */
117158 private static String checkForManifestCreatedWithMD5SumTools (final String path , final Set <BagitWarning > warnings , final Collection <BagitWarning > warningsToIgnore ){
118159 String fixedPath = path ;
119160 final boolean startsWithStar = path .charAt (0 ) == '*' ;
@@ -130,6 +171,17 @@ private static String checkForManifestCreatedWithMD5SumTools(final String path,
130171 return fixedPath ;
131172 }
132173
174+ /*
175+ * Check that the same line doesn't already exist in the set of paths
176+ */
177+ private static void checkForDifferentCase (final String path , final Set <String > paths , final Path manifestFile ,
178+ final Set <BagitWarning > warnings , final Collection <BagitWarning > warningsToIgnore ){
179+ if (!warningsToIgnore .contains (BagitWarning .DIFFERENT_CASE ) && paths .contains (path .toLowerCase ())){
180+ logger .warn (messages .getString ("different_case_warning" ), manifestFile , path );
181+ warnings .add (BagitWarning .DIFFERENT_CASE );
182+ }
183+ }
184+
133185 /*
134186 * Check that the file specified has not changed its normalization (i.e. have the bytes changed but it still looks the same?)
135187 */
@@ -210,6 +262,47 @@ else if(!warningsToIgnore.contains(BagitWarning.NON_STANDARD_ALGORITHM) && !"SHA
210262 warnings .add (BagitWarning .NON_STANDARD_ALGORITHM );
211263 }
212264 }
265+
266+ static void checkManifestSets (final Version version , final List <Path > tagManifests , final List <Path > payloadManifests ,
267+ final Set <BagitWarning > warnings , final Charset encoding )
268+ throws IOException , MaliciousPathException , UnsupportedAlgorithmException , InvalidBagitFileFormatException {
269+ //edge case, for version 1.0+ all tag manifests SHOULD list the same set of files
270+ if (tagManifests .size () > 1 && VERSION_1_0 .isSameOrOlder (version )){
271+ checkManifestsListSameSetOfFiles (warnings , tagManifests , encoding );
272+ }
273+
274+ //edge case, for version 1.0+ all payload manifests SHOULD list the same set of files
275+ if (payloadManifests .size () > 1 && VERSION_1_0 .isSameOrOlder (version )){
276+ checkManifestsListSameSetOfFiles (warnings , payloadManifests , encoding );
277+ }
278+ }
279+
280+ //starting with version 1.0 all manifest types (tag, payload) should list the same set of files
281+ @ SuppressWarnings ("PMD.EmptyCatchBlock" )
282+ static void checkManifestsListSameSetOfFiles (final Set <BagitWarning > warnings , final List <Path > manifestPaths , final Charset charset ) throws IOException , MaliciousPathException , UnsupportedAlgorithmException , InvalidBagitFileFormatException {
283+ final StandardBagitAlgorithmNameToSupportedAlgorithmMapping nameMapping = new StandardBagitAlgorithmNameToSupportedAlgorithmMapping ();
284+
285+ Manifest compareToManifest = null ;
286+ Path compareToManifestPath = null ;
287+ for (final Path manifestPath : manifestPaths ) {
288+ try {
289+ final Manifest manifest = ManifestReader .readManifest (nameMapping , manifestPath , manifestPath .getParent (), charset );
290+ if (compareToManifest == null ) {
291+ compareToManifestPath = manifestPath ;
292+ compareToManifest = manifest ;
293+ continue ;
294+ }
295+
296+ if (!compareToManifest .getFileToChecksumMap ().keySet ().equals (manifest .getFileToChecksumMap ().keySet ())) {
297+ logger .warn (messages .getString ("manifest_fileset_differ" ), compareToManifestPath , manifestPath );
298+ warnings .add (BagitWarning .MANIFEST_SETS_DIFFER );
299+ }
300+ }
301+ catch (UnsupportedAlgorithmException e ) {
302+ //ignore an unsupported algorithm as it is caught in checkAlgorthm()
303+ }
304+ }
305+ }
213306
214307 //for unit test only
215308 static String getOsFilesRegex () {
0 commit comments