From 3652916e0b741509119c77ba8d407ff409b5fa6c Mon Sep 17 00:00:00 2001 From: Hugo Hurskainen Date: Wed, 10 Jun 2026 12:21:21 +0300 Subject: [PATCH 1/3] feat: add --store-hash to store content hashes in ltfs.hash.* VEAs --- man/ltfs_ordered_copy.1 | 6 ++ man/sgml/ltfs_ordered_copy.sgml | 12 ++++ src/utils/ltfs_ordered_copy | 104 ++++++++++++++++++++++++++++++-- 3 files changed, 117 insertions(+), 5 deletions(-) diff --git a/man/ltfs_ordered_copy.1 b/man/ltfs_ordered_copy.1 index 0444daee..477fd6a1 100644 --- a/man/ltfs_ordered_copy.1 +++ b/man/ltfs_ordered_copy.1 @@ -40,6 +40,12 @@ Configure verbosity of logger. VERBOSE shall be 0-6. (Default: 4) .TP \fB-q, --quiet\fR No message outout +.TP +\fB--store-hash\fR +Compute a content hash of each copied file and store it in the \fBltfs.hash.\fR\fIALGO\fR extended attribute on the destination. This is intended for LTFS destinations, which persist the value into the index (LTFS Format Spec 2.4). The hash is computed from the source file content and stored as a hex string; if the hash cannot be stored the file is treated as failed. +.TP +\fB--hash-algo\fR \fIALGO\fR +Hash algorithm to use with \fB--store-hash\fR. Defaults to \fBsha256\fR and may be any algorithm guaranteed by the Python \fBhashlib\fR module (on Python 3: md5, sha1, sha224, sha256, sha384, sha512, sha3_224, sha3_256, sha3_384, sha3_512, shake_128, shake_256, blake2b, blake2s; on Python 2.7 the classic md5/sha1/sha224/sha256/sha384/sha512). .SH "COMMAND EXAMPLES" .PP This section shows various command examples. diff --git a/man/sgml/ltfs_ordered_copy.sgml b/man/sgml/ltfs_ordered_copy.sgml index 46f52406..ad883e87 100644 --- a/man/sgml/ltfs_ordered_copy.sgml +++ b/man/sgml/ltfs_ordered_copy.sgml @@ -104,6 +104,18 @@ No message outout + + + + Compute a content hash of each copied file and store it in the ltfs.hash.ALGO extended attribute on the destination. This is intended for LTFS destinations, which persist the value into the index (LTFS Format Spec 2.4). The hash is computed from the source file content and stored as a hex string; if the hash cannot be stored the file is treated as failed. + + + + ALGO + + Hash algorithm to use with . Defaults to sha256 and may be any algorithm guaranteed by the Python hashlib module (on Python 3: md5, sha1, sha224, sha256, sha384, sha512, sha3_224, sha3_256, sha3_384, sha3_512, shake_128, shake_256, blake2b, blake2s; on Python 2.7 the classic md5/sha1/sha224/sha256/sha384/sha512). + + diff --git a/src/utils/ltfs_ordered_copy b/src/utils/ltfs_ordered_copy index 5537964e..5b45a012 100755 --- a/src/utils/ltfs_ordered_copy +++ b/src/utils/ltfs_ordered_copy @@ -40,18 +40,65 @@ import argparse import xattr import shutil import threading +import hashlib from logging import getLogger, basicConfig, NOTSET, CRITICAL, ERROR, WARNING, INFO, DEBUG from collections import deque +# hashlib.algorithms_guaranteed is Python 3.2+; Python 2.7 exposes hashlib.algorithms +# instead. Use whichever exists so the script runs on both 2.7 and 3.x. (The "or" only +# evaluates the 2.7 fallback when algorithms_guaranteed is absent, so it never raises on 3.x.) +HASH_ALGORITHMS = getattr(hashlib, 'algorithms_guaranteed', None) or set(hashlib.algorithms) + +def compute_file_hash(path, algo): + """Stream a file and return its hex digest using the named hashlib algorithm.""" + h = hashlib.new(algo) + with open(path, 'rb') as f: + for chunk in iter(lambda: f.read(1024 * 1024), b''): + h.update(chunk) + if algo.startswith('shake_'): + # SHAKE is an extendable-output function; emit a fixed-length 32-byte digest. + return h.hexdigest(32) + return h.hexdigest() + +def ensure_ltfs_hash_supported(probe_path, logger): + """When --store-hash targets an LTFS volume, require LTFS Format Spec >= 2.4 (the + version that introduced the stored ltfs.hash.* VEA), aborting on an older LTFS + volume. Non-LTFS destinations are left alone -- there ltfs.hash.* is just a plain + user extended attribute.""" + try: + sig = xattr.get(probe_path, VEA_PREFIX + LTFS_SIG_VEA) + except Exception: + return # No LTFS signature: not an LTFS destination, nothing to gate. + if isinstance(sig, bytes): + sig = sig.decode('ascii', 'replace') + if not sig.startswith('LTFS'): + return + try: + spec = xattr.get(probe_path, VEA_PREFIX + 'ltfs.softwareFormatSpec') + if isinstance(spec, bytes): + spec = spec.decode('ascii', 'replace') + nums = [int(x) for x in spec.strip().split('.')[:2]] + version = (nums[0], nums[1] if len(nums) > 1 else 0) + except Exception as e: + logger.error("--store-hash: cannot determine the LTFS format spec version of '{0}': {1}".format(probe_path, str(e))) + exit(2) + if version < (2, 4): + logger.error("--store-hash: destination LTFS Format Spec {0} is older than 2.4, which is " + "required to store ltfs.hash.* attributes. Omit --store-hash or use a 2.4+ " + "LTFS volume.".format(spec)) + exit(2) + logger.log(NOTSET + 1, "Destination LTFS Format Spec {0} supports ltfs.hash.* (>= 2.4)".format(spec)) + class CopyItem: """""" - def __init__(self, src, dst, vea_pre, cp_attr, cp_xattr, logger): #initialization + def __init__(self, src, dst, vea_pre, cp_attr, cp_xattr, logger, store_hash=None): #initialization self.src = src self.dst = dst self.vea_pre = vea_pre self.cp_attr = cp_attr self.cp_xattr = cp_xattr + self.store_hash = store_hash self.vuuid = '' self.part = '' self.start = -1 @@ -101,6 +148,20 @@ class CopyItem: self.logger.error('Failed to copy "{0}" to "{1}": {2}'.format(self.src, self.dst, str(str(e)))) return False + if self.store_hash: + # Store the content hash in the ltfs.hash. VEA on the destination. + # The hash is computed from the (on-disk) source, whose bytes are identical + # to what was just copied; on LTFS this is persisted into the index. + try: + target = self.dst + if os.path.isdir(target): + target = os.path.join(target, os.path.basename(self.src)) + digest = compute_file_hash(self.src, self.store_hash) + xattr.set(target, self.vea_pre + 'ltfs.hash.' + self.store_hash, digest.encode('ascii')) + except Exception as e: + self.logger.error('Copied "{0}" to "{1}" but failed to store {2} hash: {3}'.format(self.src, self.dst, self.store_hash, str(e))) + return False + return True def __repr__(self): @@ -151,7 +212,7 @@ class CopyQueue: self.items = self.items + 1 - def walk_dir(self, source, dest, cp_attr, cp_xattr=False): + def walk_dir(self, source, dest, cp_attr, cp_xattr=False, store_hash=None): (source_root, t) = os.path.split(source) prefix_len = len(source_root) dst = dest + "/" + t @@ -171,7 +232,7 @@ class CopyQueue: for f in sorted(files) if self.sort_files else files: self.logger.log(NOTSET + 1, 'Creating a copy item for file {}'.format(f)) c = CopyItem(os.path.join(root, f), os.path.join(dst, f), VEA_PREFIX, - cp_attr, cp_xattr, logger) + cp_attr, cp_xattr, logger, store_hash) self.add_copy_item(c) for d in walk_dirs: @@ -280,6 +341,14 @@ parser.add_argument('-v', help='Verbose output. Set VERBOSE level 5', action='st parser.add_argument('--verbose', help='Configure verbosity of logger. VERBOSE shall be 0-6. default is 4', default = str(logger_info)) parser.add_argument('-q','--quiet', help='No message output', action='store_true') parser.add_argument('--sort-files', help='Sort the file list before copying', action='store_true') +parser.add_argument('--store-hash', action='store_true', + help='Compute a content hash of each copied file and store it in the ' + 'ltfs.hash. extended attribute on the destination (intended ' + 'for LTFS destinations, which persist it in the index per LTFS Format ' + 'Spec 2.4). The algorithm is selected with --hash-algo (default sha256).') +parser.add_argument('--hash-algo', default='sha256', metavar='ALGO', + help='Hash algorithm to use with --store-hash. Default sha256. Available: ' + + ', '.join(sorted(HASH_ALGORITHMS)) + '.') args=parser.parse_args() @@ -318,6 +387,18 @@ else: logger.info('Tape order aware copy for LTFS') +# Resolve --store-hash / --hash-algo into a single value: the algorithm name when +# hashing is enabled, otherwise None. Downstream code treats it as "algo or falsy". +if args.store_hash: + algo = args.hash_algo.lower() + if algo not in HASH_ALGORITHMS: + logger.error("Unsupported hash algorithm '{0}'. Available: {1}".format( + algo, ', '.join(sorted(HASH_ALGORITHMS)))) + exit(2) + args.store_hash = algo +else: + args.store_hash = None + if args.target_directory: if args.DEST != None: args.SOURCE.extend(args.DEST) @@ -336,6 +417,13 @@ if args.DEST == None: logger.error('No destination is specified') exit(2) +if args.store_hash: + # ltfs.hash.* is a stored VEA introduced in LTFS Format Spec 2.4. If the + # destination is on an LTFS volume, verify it is new enough up front so we + # fail fast instead of erroring on every single file. + hash_probe = args.DEST if os.path.isdir(args.DEST) else (os.path.dirname(args.DEST) or '.') + ensure_ltfs_hash_supported(hash_probe, logger) + # Special case: # Copy source is only one file if args.recursive == False and len(args.SOURCE) == 1: @@ -349,6 +437,12 @@ if args.recursive == False and len(args.SOURCE) == 1: if not os.path.exists(new_d): os.makedirs(new_d) shutil.copy(args.SOURCE[0], args.DEST) + if args.store_hash: + target = args.DEST + if os.path.isdir(target): + target = os.path.join(target, os.path.basename(args.SOURCE[0])) + digest = compute_file_hash(args.SOURCE[0], args.store_hash) + xattr.set(target, VEA_PREFIX + 'ltfs.hash.' + args.store_hash, digest.encode('ascii')) except Exception as e: logger.error(str(e)) exit(1) @@ -402,7 +496,7 @@ for s in args.SOURCE: (new_d, t) = os.path.split(dst) if not os.path.exists(new_d): os.makedirs(new_d) - c = CopyItem(s, dst, VEA_PREFIX, args.p, args.all, logger) + c = CopyItem(s, dst, VEA_PREFIX, args.p, args.all, logger, args.store_hash) copyq.add_copy_item(c) else: logger.log(NOTSET + 1, 'Creating copy item for directory {}'.format(s)) @@ -414,7 +508,7 @@ for s in args.SOURCE: if not os.path.exists(new_d): os.makedirs(new_d) dst = new_d - copyq.walk_dir(s, dst, args.p, args.all) + copyq.walk_dir(s, dst, args.p, args.all, args.store_hash) else: logger.warning("omitting directory '{0}'".format(s)) From 8b60ef002dfb5740ddc0c53b7f35aa084cf1ea11 Mon Sep 17 00:00:00 2001 From: Hugo Hurskainen Date: Fri, 12 Jun 2026 19:34:25 +0300 Subject: [PATCH 2/3] fix: use standardized ltfs.hash. names from LTFS Format Spec 2.4 (Table F.1) --- man/ltfs_ordered_copy.1 | 6 +-- man/sgml/ltfs_ordered_copy.sgml | 6 +-- src/utils/ltfs_ordered_copy | 69 ++++++++++++++++++++++----------- 3 files changed, 52 insertions(+), 29 deletions(-) diff --git a/man/ltfs_ordered_copy.1 b/man/ltfs_ordered_copy.1 index 477fd6a1..afc3e2e7 100644 --- a/man/ltfs_ordered_copy.1 +++ b/man/ltfs_ordered_copy.1 @@ -42,10 +42,10 @@ Configure verbosity of logger. VERBOSE shall be 0-6. (Default: 4) No message outout .TP \fB--store-hash\fR -Compute a content hash of each copied file and store it in the \fBltfs.hash.\fR\fIALGO\fR extended attribute on the destination. This is intended for LTFS destinations, which persist the value into the index (LTFS Format Spec 2.4). The hash is computed from the source file content and stored as a hex string; if the hash cannot be stored the file is treated as failed. +Compute a content hash of each copied file and store it in the \fBltfs.hash.\fR\fIHASHTYPE\fR extended attribute on the destination, as defined by the LTFS Format Specification 2.4 (Annex F, Table F.1). This is intended for LTFS destinations, which persist the value into the index. The hash is stored as a UTF-8 hex string of the length the spec requires for the type; if the hash cannot be stored the file is treated as failed. .TP -\fB--hash-algo\fR \fIALGO\fR -Hash algorithm to use with \fB--store-hash\fR. Defaults to \fBsha256\fR and may be any algorithm guaranteed by the Python \fBhashlib\fR module (on Python 3: md5, sha1, sha224, sha256, sha384, sha512, sha3_224, sha3_256, sha3_384, sha3_512, shake_128, shake_256, blake2b, blake2s; on Python 2.7 the classic md5/sha1/sha224/sha256/sha384/sha512). +\fB--hash-algo\fR \fIHASHTYPE\fR +LTFS hash type to use with \fB--store-hash\fR (LTFS Format Spec 2.4, Table F.1). One of \fBcrc32sum\fR, \fBmd5sum\fR, \fBsha1sum\fR, \fBsha256sum\fR, \fBsha512sum\fR; the bare names \fBcrc32\fR, \fBmd5\fR, \fBsha1\fR, \fBsha256\fR, \fBsha512\fR are accepted as aliases. Defaults to \fBsha256sum\fR. .SH "COMMAND EXAMPLES" .PP This section shows various command examples. diff --git a/man/sgml/ltfs_ordered_copy.sgml b/man/sgml/ltfs_ordered_copy.sgml index ad883e87..b3b1223b 100644 --- a/man/sgml/ltfs_ordered_copy.sgml +++ b/man/sgml/ltfs_ordered_copy.sgml @@ -107,13 +107,13 @@ - Compute a content hash of each copied file and store it in the ltfs.hash.ALGO extended attribute on the destination. This is intended for LTFS destinations, which persist the value into the index (LTFS Format Spec 2.4). The hash is computed from the source file content and stored as a hex string; if the hash cannot be stored the file is treated as failed. + Compute a content hash of each copied file and store it in the ltfs.hash.HASHTYPE extended attribute on the destination, as defined by the LTFS Format Specification 2.4 (Annex F, Table F.1). This is intended for LTFS destinations, which persist the value into the index. The hash is stored as a UTF-8 hex string of the length the spec requires for the type; if the hash cannot be stored the file is treated as failed. - ALGO + HASHTYPE - Hash algorithm to use with . Defaults to sha256 and may be any algorithm guaranteed by the Python hashlib module (on Python 3: md5, sha1, sha224, sha256, sha384, sha512, sha3_224, sha3_256, sha3_384, sha3_512, shake_128, shake_256, blake2b, blake2s; on Python 2.7 the classic md5/sha1/sha224/sha256/sha384/sha512). + LTFS hash type to use with (LTFS Format Spec 2.4, Table F.1). One of crc32sum, md5sum, sha1sum, sha256sum, sha512sum; the bare names crc32, md5, sha1, sha256, sha512 are accepted as aliases. Defaults to sha256sum. diff --git a/src/utils/ltfs_ordered_copy b/src/utils/ltfs_ordered_copy index 5b45a012..ac5cda7a 100755 --- a/src/utils/ltfs_ordered_copy +++ b/src/utils/ltfs_ordered_copy @@ -41,24 +41,42 @@ import xattr import shutil import threading import hashlib +import zlib from logging import getLogger, basicConfig, NOTSET, CRITICAL, ERROR, WARNING, INFO, DEBUG from collections import deque -# hashlib.algorithms_guaranteed is Python 3.2+; Python 2.7 exposes hashlib.algorithms -# instead. Use whichever exists so the script runs on both 2.7 and 3.x. (The "or" only -# evaluates the 2.7 fallback when algorithms_guaranteed is absent, so it never raises on 3.x.) -HASH_ALGORITHMS = getattr(hashlib, 'algorithms_guaranteed', None) or set(hashlib.algorithms) - -def compute_file_hash(path, algo): - """Stream a file and return its hex digest using the named hashlib algorithm.""" +# LTFS Format Specification 2.4, Annex F (Table F.1): file content hashes are stored as +# ltfs.hash. extended attributes. Only these hashtypes are standardized; all +# other hashtype values are reserved by the spec. Each maps to its underlying algorithm +# (all of which are available on both Python 2.7 and 3.x). +HASH_TYPES = { + 'crc32sum': 'crc32', + 'md5sum': 'md5', + 'sha1sum': 'sha1', + 'sha256sum': 'sha256', + 'sha512sum': 'sha512', +} +# Accept the bare algorithm name as a convenience alias for the spec hashtype. +HASH_ALIASES = { + 'crc32': 'crc32sum', 'md5': 'md5sum', 'sha1': 'sha1sum', + 'sha256': 'sha256sum', 'sha512': 'sha512sum', +} + +def compute_file_hash(path, hashtype): + """Stream a file and return the hex digest for an LTFS Table F.1 hashtype (e.g. + 'sha256sum'), as a UTF-8 hex string of the length the spec requires for that type.""" + algo = HASH_TYPES[hashtype] + if algo == 'crc32': + crc = 0 + with open(path, 'rb') as f: + for chunk in iter(lambda: f.read(1024 * 1024), b''): + crc = zlib.crc32(chunk, crc) + return '%08x' % (crc & 0xffffffff) h = hashlib.new(algo) with open(path, 'rb') as f: for chunk in iter(lambda: f.read(1024 * 1024), b''): h.update(chunk) - if algo.startswith('shake_'): - # SHAKE is an extendable-output function; emit a fixed-length 32-byte digest. - return h.hexdigest(32) return h.hexdigest() def ensure_ltfs_hash_supported(probe_path, logger): @@ -149,7 +167,7 @@ class CopyItem: return False if self.store_hash: - # Store the content hash in the ltfs.hash. VEA on the destination. + # Store the content hash in the ltfs.hash. VEA on the destination. # The hash is computed from the (on-disk) source, whose bytes are identical # to what was just copied; on LTFS this is persisted into the index. try: @@ -343,12 +361,16 @@ parser.add_argument('-q','--quiet', help='No message output', action='store_true parser.add_argument('--sort-files', help='Sort the file list before copying', action='store_true') parser.add_argument('--store-hash', action='store_true', help='Compute a content hash of each copied file and store it in the ' - 'ltfs.hash. extended attribute on the destination (intended ' + 'ltfs.hash. extended attribute on the destination (intended ' 'for LTFS destinations, which persist it in the index per LTFS Format ' - 'Spec 2.4). The algorithm is selected with --hash-algo (default sha256).') -parser.add_argument('--hash-algo', default='sha256', metavar='ALGO', - help='Hash algorithm to use with --store-hash. Default sha256. Available: ' - + ', '.join(sorted(HASH_ALGORITHMS)) + '.') + 'Spec 2.4, Table F.1). The hash type is selected with --hash-algo ' + '(default sha256sum).') +parser.add_argument('--hash-algo', default='sha256sum', metavar='HASHTYPE', + help='LTFS hash type to use with --store-hash, stored as ltfs.hash. ' + '(LTFS Format Spec 2.4, Table F.1). One of: ' + + ', '.join(sorted(HASH_TYPES)) + + ' (the bare names ' + ', '.join(sorted(HASH_ALIASES)) + + ' are accepted as aliases). Default sha256sum.') args=parser.parse_args() @@ -387,15 +409,16 @@ else: logger.info('Tape order aware copy for LTFS') -# Resolve --store-hash / --hash-algo into a single value: the algorithm name when -# hashing is enabled, otherwise None. Downstream code treats it as "algo or falsy". +# Resolve --store-hash / --hash-algo into a single value: the LTFS spec hashtype when +# hashing is enabled, otherwise None. Downstream code treats it as "hashtype or falsy". if args.store_hash: - algo = args.hash_algo.lower() - if algo not in HASH_ALGORITHMS: - logger.error("Unsupported hash algorithm '{0}'. Available: {1}".format( - algo, ', '.join(sorted(HASH_ALGORITHMS)))) + ht = args.hash_algo.lower() + ht = HASH_ALIASES.get(ht, ht) # accept a bare algorithm name as an alias + if ht not in HASH_TYPES: + logger.error("Unsupported hash type '{0}'. LTFS Format Spec 2.4 (Table F.1) defines: {1}.".format( + args.hash_algo, ', '.join(sorted(HASH_TYPES)))) exit(2) - args.store_hash = algo + args.store_hash = ht else: args.store_hash = None From e989d9ba7836c7304e6c4a4abcf3c83526f10c72 Mon Sep 17 00:00:00 2001 From: Hugo Hurskainen Date: Sat, 13 Jun 2026 00:22:11 +0300 Subject: [PATCH 3/3] feat: add --store-hash-all to write all Table F.1 hash types (parallelized) --- man/ltfs_ordered_copy.1 | 3 ++ man/sgml/ltfs_ordered_copy.sgml | 6 +++ src/utils/ltfs_ordered_copy | 86 ++++++++++++++++++++++++++++----- 3 files changed, 83 insertions(+), 12 deletions(-) diff --git a/man/ltfs_ordered_copy.1 b/man/ltfs_ordered_copy.1 index afc3e2e7..04051a54 100644 --- a/man/ltfs_ordered_copy.1 +++ b/man/ltfs_ordered_copy.1 @@ -46,6 +46,9 @@ Compute a content hash of each copied file and store it in the \fBltfs.hash.\fR\ .TP \fB--hash-algo\fR \fIHASHTYPE\fR LTFS hash type to use with \fB--store-hash\fR (LTFS Format Spec 2.4, Table F.1). One of \fBcrc32sum\fR, \fBmd5sum\fR, \fBsha1sum\fR, \fBsha256sum\fR, \fBsha512sum\fR; the bare names \fBcrc32\fR, \fBmd5\fR, \fBsha1\fR, \fBsha256\fR, \fBsha512\fR are accepted as aliases. Defaults to \fBsha256sum\fR. +.TP +\fB--store-hash-all\fR +Like \fB--store-hash\fR, but compute and store every standardized LTFS hash type (\fBcrc32sum\fR, \fBmd5sum\fR, \fBsha1sum\fR, \fBsha256sum\fR, \fBsha512sum\fR) for each file. Each file is read once and the hash types are computed in parallel, one worker thread per type. Overrides \fB--hash-algo\fR. .SH "COMMAND EXAMPLES" .PP This section shows various command examples. diff --git a/man/sgml/ltfs_ordered_copy.sgml b/man/sgml/ltfs_ordered_copy.sgml index b3b1223b..c938b56d 100644 --- a/man/sgml/ltfs_ordered_copy.sgml +++ b/man/sgml/ltfs_ordered_copy.sgml @@ -116,6 +116,12 @@ LTFS hash type to use with (LTFS Format Spec 2.4, Table F.1). One of crc32sum, md5sum, sha1sum, sha256sum, sha512sum; the bare names crc32, md5, sha1, sha256, sha512 are accepted as aliases. Defaults to sha256sum. + + + + Like , but compute and store every standardized LTFS hash type (crc32sum, md5sum, sha1sum, sha256sum, sha512sum) for each file. Each file is read once and the hash types are computed in parallel, one worker thread per type. Overrides . + + diff --git a/src/utils/ltfs_ordered_copy b/src/utils/ltfs_ordered_copy index ac5cda7a..3c655f21 100755 --- a/src/utils/ltfs_ordered_copy +++ b/src/utils/ltfs_ordered_copy @@ -79,6 +79,57 @@ def compute_file_hash(path, hashtype): h.update(chunk) return h.hexdigest() +def compute_file_hashes(path, hashtypes): + """Compute one or more LTFS Table F.1 hashtypes for a file and return + {hashtype: hexdigest}. The file is read only once. With several hashtypes each is + updated in its own worker thread fed by a single reader; hashlib and zlib release + the GIL during their update calls, so the per-type work runs across CPU cores in + parallel.""" + if len(hashtypes) == 1: + ht = hashtypes[0] + return {ht: compute_file_hash(path, ht)} + + try: + from queue import Queue # Python 3 + except ImportError: + from Queue import Queue # Python 2 + + queues = dict((ht, Queue(maxsize=8)) for ht in hashtypes) + results = {} + + def worker(ht): + q = queues[ht] + algo = HASH_TYPES[ht] + if algo == 'crc32': + crc = 0 + chunk = q.get() + while chunk is not None: + crc = zlib.crc32(chunk, crc) + chunk = q.get() + results[ht] = '%08x' % (crc & 0xffffffff) + else: + h = hashlib.new(algo) + chunk = q.get() + while chunk is not None: + h.update(chunk) + chunk = q.get() + results[ht] = h.hexdigest() + + threads = [threading.Thread(target=worker, args=(ht,)) for ht in hashtypes] + for t in threads: + t.start() + try: + with open(path, 'rb') as f: + for chunk in iter(lambda: f.read(1024 * 1024), b''): + for q in queues.values(): + q.put(chunk) + finally: + for q in queues.values(): + q.put(None) # signal end-of-stream to every worker + for t in threads: + t.join() + return results + def ensure_ltfs_hash_supported(probe_path, logger): """When --store-hash targets an LTFS volume, require LTFS Format Spec >= 2.4 (the version that introduced the stored ltfs.hash.* VEA), aborting on an older LTFS @@ -167,17 +218,19 @@ class CopyItem: return False if self.store_hash: - # Store the content hash in the ltfs.hash. VEA on the destination. - # The hash is computed from the (on-disk) source, whose bytes are identical - # to what was just copied; on LTFS this is persisted into the index. + # Store one ltfs.hash. VEA per requested hashtype on the + # destination. The hash(es) are computed from the (on-disk) source, whose + # bytes are identical to what was just copied; on LTFS this is persisted + # into the index. try: target = self.dst if os.path.isdir(target): target = os.path.join(target, os.path.basename(self.src)) - digest = compute_file_hash(self.src, self.store_hash) - xattr.set(target, self.vea_pre + 'ltfs.hash.' + self.store_hash, digest.encode('ascii')) + digests = compute_file_hashes(self.src, self.store_hash) + for ht in self.store_hash: + xattr.set(target, self.vea_pre + 'ltfs.hash.' + ht, digests[ht].encode('ascii')) except Exception as e: - self.logger.error('Copied "{0}" to "{1}" but failed to store {2} hash: {3}'.format(self.src, self.dst, self.store_hash, str(e))) + self.logger.error('Copied "{0}" to "{1}" but failed to store hash(es): {2}'.format(self.src, self.dst, str(e))) return False return True @@ -371,6 +424,11 @@ parser.add_argument('--hash-algo', default='sha256sum', metavar='HASHTYPE', + ', '.join(sorted(HASH_TYPES)) + ' (the bare names ' + ', '.join(sorted(HASH_ALIASES)) + ' are accepted as aliases). Default sha256sum.') +parser.add_argument('--store-hash-all', action='store_true', + help='Like --store-hash, but compute and store every standardized LTFS ' + 'hash type (' + ', '.join(sorted(HASH_TYPES)) + ') for each file. ' + 'The file is read once and the hash types are computed in parallel ' + '(one worker thread each). Overrides --hash-algo.') args=parser.parse_args() @@ -409,16 +467,19 @@ else: logger.info('Tape order aware copy for LTFS') -# Resolve --store-hash / --hash-algo into a single value: the LTFS spec hashtype when -# hashing is enabled, otherwise None. Downstream code treats it as "hashtype or falsy". -if args.store_hash: +# Resolve --store-hash / --store-hash-all / --hash-algo into args.store_hash: a list of +# LTFS spec hashtypes to compute when hashing is enabled, otherwise None. Downstream code +# treats it as "list of hashtypes or falsy". +if args.store_hash_all: + args.store_hash = sorted(HASH_TYPES) # every standardized hashtype +elif args.store_hash: ht = args.hash_algo.lower() ht = HASH_ALIASES.get(ht, ht) # accept a bare algorithm name as an alias if ht not in HASH_TYPES: logger.error("Unsupported hash type '{0}'. LTFS Format Spec 2.4 (Table F.1) defines: {1}.".format( args.hash_algo, ', '.join(sorted(HASH_TYPES)))) exit(2) - args.store_hash = ht + args.store_hash = [ht] else: args.store_hash = None @@ -464,8 +525,9 @@ if args.recursive == False and len(args.SOURCE) == 1: target = args.DEST if os.path.isdir(target): target = os.path.join(target, os.path.basename(args.SOURCE[0])) - digest = compute_file_hash(args.SOURCE[0], args.store_hash) - xattr.set(target, VEA_PREFIX + 'ltfs.hash.' + args.store_hash, digest.encode('ascii')) + digests = compute_file_hashes(args.SOURCE[0], args.store_hash) + for ht in args.store_hash: + xattr.set(target, VEA_PREFIX + 'ltfs.hash.' + ht, digests[ht].encode('ascii')) except Exception as e: logger.error(str(e)) exit(1)