diff --git a/docs/source/overview.parallelisation.rst b/docs/source/overview.parallelisation.rst
index f0388195..d9f879a8 100644
--- a/docs/source/overview.parallelisation.rst
+++ b/docs/source/overview.parallelisation.rst
@@ -68,7 +68,10 @@ created using `multiprocessing.pool.Pool
 The number of threads is controlled by an integer, passed in to the
 function as an optional ``num_python_subprocesses`` argument, or stored
 in the env var ``WFL_NUM_PYTHON_SUBPROCESSES``. The script should be
-started with a normal run of the python executable.
+started with a normal run of the python executable. Setting
+the ``WFL_TORCH_N_GPUS`` env var to the number of GPUs
+causes ``wfl`` to call ``torch.cuda.set_device()`` in each subprocess,
+ensuring that each subprocess is assigned a GPU distinct from the others.

 ========================================
diff --git a/tests/local_scripts/complete_pytest.tin b/tests/local_scripts/complete_pytest.tin
index 2d1f7060..c1f98f65 100755
--- a/tests/local_scripts/complete_pytest.tin
+++ b/tests/local_scripts/complete_pytest.tin
@@ -2,17 +2,40 @@
 module purge
 # module load compiler/gnu python/system python_extras/quippy lapack/mkl
-module load compiler/gnu python python_extras/quippy lapack/mkl
+module load compiler/gnu python python_extras/structure python_extras/quippy lapack/mkl
 # for wfl dependencies
 module load python_extras/wif
 module load python_extras/torch/cpu

+if [ ! -z "$WFL_PYTEST_POST_MODULE_COMMANDS" ]; then
+    echo "Using WFL_PYTEST_POST_MODULE_COMMANDS '$WFL_PYTEST_POST_MODULE_COMMANDS'" 1>&2
+    eval $WFL_PYTEST_POST_MODULE_COMMANDS
+else
+    echo "Using no WFL_PYTEST_POST_MODULE_COMMANDS" 1>&2
+fi
+
 if [ -z "$WFL_PYTEST_EXPYRE_INFO" ]; then
     echo "To override partition used, set WFL_PYTEST_EXPYRE_INFO='{\"resources\" : {\"partitions\": \"DESIRED_PARTITION\"}}'" 1>&2
 fi
+
+WFL_PYTEST_EXPYRE_INFO=$(
+cat << EOF | python3
+import json, os
+i = {"pre_cmds": ["module purge",
+     "module load compiler/gnu lapack/mkl python python_extras/structure python_extras/quippy python_extras/wif dft/vasp dft/pwscf",
+     "module list"]}
+ienv = json.loads(os.environ.get("WFL_PYTEST_EXPYRE_INFO", "{}"))
+i.update(ienv)
+print(json.dumps(i))
+EOF
+)
+export WFL_PYTEST_EXPYRE_INFO
+echo "Using WFL_PYTEST_EXPYRE_INFO '$WFL_PYTEST_EXPYRE_INFO'" 1>&2

 if [ ! 
-z $WFL_PYTHONPATH_EXTRA ]; then + echo "Adding WFL_PYTHONPATH_EXTRA '$WFL_PYTHONPATH_EXTRA'" 1>&2 export PYTHONPATH=${WFL_PYTHONPATH_EXTRA}:${PYTHONPATH} +else + echo "Adding no WFL_PYTHONPATH_EXTRA" 1>&2 fi export JULIA_PROJECT=${PWD}/tests/assets/julia @@ -29,7 +52,7 @@ echo "" >> complete_pytest.tin.out # buildcell export WFL_PYTEST_BUILDCELL=$HOME/src/work/AIRSS/airss-0.9.1/src/buildcell/src/buildcell # VASP -module load dft/vasp +module load dft/vasp/serial export ASE_VASP_COMMAND=vasp.serial export ASE_VASP_COMMAND_GAMMA=vasp.gamma.serial export PYTEST_VASP_POTCAR_DIR=$VASP_PATH/pot/rev_54/PBE @@ -37,6 +60,8 @@ export PYTEST_VASP_POTCAR_DIR=$VASP_PATH/pot/rev_54/PBE module load dft/pwscf # no ORCA +module list + export OPENBLAS_NUM_THREADS=1 export MKL_NUM_THREADS=1 # required for descriptor calc to not hang @@ -70,7 +95,7 @@ l=`egrep '^=.*(passed|failed|skipped|xfailed|error).* in ' complete_pytest.tin.o echo "summary line $l" lp=$( echo $l | sed -E -e 's/ in .*//' -e 's/\s*,\s*/\n/g' ) -declare -A expected_n=( ["passed"]="177" ["skipped"]="21" ["warnings"]=823 ["xfailed"]=2 ["xpassed"]=1 ) +declare -A expected_n=( ["passed"]="188" ["skipped"]="26" ["warnings"]=1068 ["xfailed"]=1 ) IFS=$'\n' t_stat=0 for out in $lp; do diff --git a/tests/test_autoparallelize.py b/tests/test_autoparallelize.py index 76747794..f2586c8f 100644 --- a/tests/test_autoparallelize.py +++ b/tests/test_autoparallelize.py @@ -1,4 +1,5 @@ import pytest +import os import time import numpy as np @@ -12,6 +13,12 @@ from wfl.calculators import generic from wfl.autoparallelize import AutoparaInfo +try: + import torch + from mace.calculators.foundations_models import mace_mp +except ImportError: + torch = None + def test_empty_iterator(tmp_path): co = buildcell.buildcell(range(0), OutputSpec(tmp_path / 'dummy.xyz'), buildcell_cmd='dummy', buildcell_input='dummy') @@ -35,21 +42,71 @@ def test_autopara_info_dict(): def test_pool_speedup(): np.random.seed(5) + rng = np.random.default_rng(5) ats = [] nconf = 60 + at_prim = Atoms('Al', cell=[1, 1, 1], pbc=[True] * 3) for _ in range(nconf): - ats.append(Atoms(['Al'] * nconf, scaled_positions=np.random.uniform(size=(nconf, 3)), cell=[10, 10, 10], pbc=[True] * 3)) + ats.append(at_prim * (4, 4, 4)) + ats[-1].rattle(rng=rng) t0 = time.time() - co = generic.calculate(ConfigSet(ats), OutputSpec(), EMT(), output_prefix="_auto_", autopara_info=AutoparaInfo(num_python_subprocesses=1)) + co = generic.calculate(ConfigSet(ats), OutputSpec(), EMT(), output_prefix="_auto_", + autopara_info=AutoparaInfo(num_python_subprocesses=1, + num_inputs_per_python_subprocess=30)) dt_1 = time.time() - t0 t0 = time.time() - co = generic.calculate(ConfigSet(ats), OutputSpec(), EMT(), output_prefix="_auto_", autopara_info=AutoparaInfo(num_python_subprocesses=2)) + co = generic.calculate(ConfigSet(ats), OutputSpec(), EMT(), output_prefix="_auto_", + autopara_info=AutoparaInfo(num_python_subprocesses=2, + num_inputs_per_python_subprocess=30)) dt_2 = time.time() - t0 print("time ratio", dt_2 / dt_1) - assert dt_2 < dt_1 * (2/3) + assert dt_2 / dt_1 < 0.75 + + +@pytest.mark.skipif(torch is None or not torch.cuda.is_available() or os.environ.get("WFL_TORCH_N_GPUS") is None, reason="No torch CUDA devices available, or WFL_TORCH_N_GPUS isn't set") +@pytest.mark.perf +def test_pool_speedup_GPU(monkeypatch): + np.random.seed(5) + + rng = np.random.default_rng(5) + ats = [] + nconf = 60 + at_prim = Atoms('Al', cell=[1, 1, 1], pbc=[True] * 3) + for _ in range(nconf): + ats.append(at_prim * (5, 5, 5)) + 
ats[-1].rattle(rng=rng) + + calc = (mace_mp, ["small-omat-0"], {"device": "cuda"}) + + req_n_gpus = os.environ["WFL_TORCH_N_GPUS"] + if len(req_n_gpus) == 0: + req_n_gpus = str(len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))) + + if "WFL_TORCH_N_GPUS" in os.environ: + monkeypatch.delenv("WFL_TORCH_N_GPUS") + + t0 = time.time() + co = generic.calculate(ConfigSet(ats), OutputSpec(), calc, output_prefix="_auto_", + autopara_info=AutoparaInfo(num_python_subprocesses=1, + num_inputs_per_python_subprocess=30)) + dt_1 = time.time() - t0 + + monkeypatch.setenv("WFL_TORCH_N_GPUS", req_n_gpus) + + t0 = time.time() + co = generic.calculate(ConfigSet(ats), OutputSpec(), calc, output_prefix="_auto_", + autopara_info=AutoparaInfo(num_python_subprocesses=2, + num_inputs_per_python_subprocess=30)) + dt_2 = time.time() - t0 + + monkeypatch.delenv("WFL_TORCH_N_GPUS") + + print("time ratio", dt_2 / dt_1) + assert dt_2 / dt_1 < 0.75 + def test_outputspec_overwrite(tmp_path): with open(tmp_path / "ats.xyz", "w") as fout: diff --git a/tests/test_clean_dir.py b/tests/test_clean_dir.py new file mode 100644 index 00000000..71882bbd --- /dev/null +++ b/tests/test_clean_dir.py @@ -0,0 +1,64 @@ +from wfl.calculators.utils import clean_rundir + +# def clean_rundir(rundir, keep_files, default_keep_files, calculation_succeeded): + +all_files = ["a", "aa", "b", "c", "d"] +default_keep_files = ["a*", "b"] +actual_default_keep_files = ["a", "aa", "b"] + +def create_files(dir): + for filename in all_files: + with open(dir / filename, "w") as fout: + fout.write("content\n") + +def check_dir(dir, files): + if files is None: + # even path doesn't exist + assert not dir.is_dir() + return + + files = set(files) + + # all expected files are present + for file in files: + assert (dir / file).is_file() + # all present files are expected + for file in dir.iterdir(): + assert file.name in files + +def test_clean_rundir(tmp_path): + # keep True + # keep all files regardless of success + for succ, files in [(True, all_files), (False, all_files)]: + p = tmp_path / f"True_{succ}" + p.mkdir() + create_files(p) + clean_rundir(p, True, default_keep_files, calculation_succeeded=succ) + check_dir(p, files) + + # keep False + # succeeded means keep nothing, failed means keep default + for succ, files in [(True, None), (False, actual_default_keep_files)]: + p = tmp_path / f"False_{succ}" + p.mkdir() + create_files(p) + clean_rundir(p, False, default_keep_files, calculation_succeeded=succ) + check_dir(p, files) + + # keep subset of default + # succeeded means keep subset, failed means keep default + for succ, files in [(True, ["a"]), (False, actual_default_keep_files)]: + p = tmp_path / f"a_{succ}" + p.mkdir() + create_files(p) + clean_rundir(p, ["a"], default_keep_files, calculation_succeeded=succ) + check_dir(p, files) + + # keep different set from default + # succeeded means keep set, failed means keep union of default and set + for succ, files in [(True, ["a", "c"]), (False, actual_default_keep_files + ["a", "c"])]: + p = tmp_path / f"ac_{succ}" + p.mkdir() + create_files(p) + clean_rundir(p, ["a", "c"], default_keep_files, calculation_succeeded=succ) + check_dir(p, files) diff --git a/tests/test_md.py b/tests/test_md.py index ac2385b5..15860fe3 100644 --- a/tests/test_md.py +++ b/tests/test_md.py @@ -20,9 +20,9 @@ from wfl.generate.md.abort import AbortOnCollision, AbortOnLowEnergy try: - from wif.Langevin_BAOAB import Langevin_BAOAB + from ase.md.langevinbaoab import LangevinBAOAB except ImportError: - Langevin_BAOAB = None + 
LangevinBAOAB = None def select_every_10_steps_for_tests_during(at): return at.info.get("MD_step", 1) % 10 == 0 @@ -153,14 +153,14 @@ def test_NPT_Berendsen(cu_slab): assert np.allclose(atoms_traj[0].cell, atoms_traj[-1].cell * cell_f) -@pytest.mark.skipif(Langevin_BAOAB is None, reason="No Langevin_BAOAB available") -def test_NPT_Langevin_BAOAB(cu_slab): +@pytest.mark.skipif(LangevinBAOAB is None, reason="No LangevinBAOAB available") +def test_NPT_LangevinBAOAB(cu_slab): calc = EMT() inputs = ConfigSet(cu_slab) outputs = OutputSpec() - atoms_traj = md.md(inputs, outputs, calculator=calc, integrator="Langevin_BAOAB", steps=300, dt=1.0, + atoms_traj = md.md(inputs, outputs, calculator=calc, integrator="LangevinBAOAB", steps=300, dt=1.0, temperature=500.0, temperature_tau=100/fs, pressure=0.0, rng=np.random.default_rng(1)) @@ -176,14 +176,14 @@ def test_NPT_Langevin_BAOAB(cu_slab): assert np.allclose(atoms_traj[0].cell, atoms_traj[-1].cell * cell_f) -@pytest.mark.skipif(Langevin_BAOAB is None, reason="No Langevin_BAOAB available") -def test_NPT_Langevin_BAOAB_hydro_F(cu_slab): +@pytest.mark.skipif(LangevinBAOAB is None, reason="No LangevinBAOAB available") +def test_NPT_LangevinBAOAB_hydro_F(cu_slab): calc = EMT() inputs = ConfigSet(cu_slab) outputs = OutputSpec() - atoms_traj = md.md(inputs, outputs, calculator=calc, integrator="Langevin_BAOAB", steps=300, dt=1.0, + atoms_traj = md.md(inputs, outputs, calculator=calc, integrator="LangevinBAOAB", steps=300, dt=1.0, temperature=500.0, temperature_tau=100/fs, pressure=0.0, hydrostatic=False, rng=np.random.default_rng(1)) diff --git a/tests/test_phonopy.py b/tests/test_phonopy.py index 7c7b3db0..9643db90 100644 --- a/tests/test_phonopy.py +++ b/tests/test_phonopy.py @@ -7,6 +7,10 @@ from wfl.configset import ConfigSet, OutputSpec from wfl.generate.phonopy import phonopy +try: + import phono3py +except ImportError: + phono3py = None def test_phonopy(tmp_path): @@ -33,6 +37,7 @@ def test_phonopy(tmp_path): for v in at.positions[1:]: assert min(np.linalg.norm(sc.positions[1:] - v, axis=1)) < 1.0e-7 +@pytest.mark.skipif(phono3py is None, reason="No phono3py module") def test_phono3py(tmp_path): at0 = Atoms(numbers=[29], cell = [[0, 2, 2], [2, 0, 2], [2, 2, 0]], positions = [[0, 0, 0]], pbc = [True]*3) at1 = Atoms(numbers=[29], cell = [[0, 1.9, 1.9], [1.9, 0, 1.9], [1.9, 1.9, 0]], positions = [[0, 0, 0]], pbc = [True]*3) @@ -62,6 +67,7 @@ def test_phono3py(tmp_path): assert sum([at.info["config_type"] == "phonon_cubic_1" for at in pert]) == 13*2 +@pytest.mark.skipif(phono3py is None, reason="No phono3py module") def test_phono3py_same_supercell(tmp_path): at0 = Atoms(numbers=[29], cell = [[0, 2, 2], [2, 0, 2], [2, 2, 0]], positions = [[0, 0, 0]], pbc = [True]*3) at1 = Atoms(numbers=[29], cell = [[0, 1.9, 1.9], [1.9, 0, 1.9], [1.9, 1.9, 0]], positions = [[0, 0, 0]], pbc = [True]*3) diff --git a/tests/test_remote_run.py b/tests/test_remote_run.py index 6c68cadf..88fce875 100644 --- a/tests/test_remote_run.py +++ b/tests/test_remote_run.py @@ -69,7 +69,7 @@ def test_vasp_fail(tmp_path, expyre_systems, monkeypatch, remoteinfo_env): def do_vasp_fail(tmp_path, sys_name, monkeypatch, remoteinfo_env): ri = {'sys_name': sys_name, 'job_name': 'pytest_vasp_'+sys_name, - 'env_vars' : ['ASE_VASP_COMMAND=NONE', 'ASE_VASP_COMMAND_GAMMA=NONE'], + 'env_vars' : ['ASE_VASP_COMMAND=NO_VASP_FAIL', 'ASE_VASP_COMMAND_GAMMA=NO_VASP_FAIL'], 'input_files' : ['POTCARs'], 'resources': {'max_time': '5m', 'num_nodes': 1}, 'num_inputs_per_queued_job': 1, 'check_interval': 10} 
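Note: as a usage sketch for the GPU parallelisation changes above (the docs addition and test_pool_speedup_GPU), the snippet below shows how an autoparallelized calculation is expected to be spread over two GPUs. It assumes torch with at least two CUDA devices and the mace package are installed; the model name, system sizes and chunk sizes are copied from the test and are not requirements:

    import os
    # per the wfl/autoparallelize/pool.py hunk below, WFL_TORCH_N_GPUS should be set
    # before wfl.autoparallelize is imported so the forkserver start method is selected
    os.environ["WFL_TORCH_N_GPUS"] = "2"

    import numpy as np
    from ase.atoms import Atoms
    from mace.calculators.foundations_models import mace_mp

    from wfl.configset import ConfigSet, OutputSpec
    from wfl.calculators import generic
    from wfl.autoparallelize import AutoparaInfo

    rng = np.random.default_rng(5)
    at_prim = Atoms('Al', cell=[1, 1, 1], pbc=[True] * 3)
    ats = []
    for _ in range(60):
        ats.append(at_prim * (5, 5, 5))
        ats[-1].rattle(rng=rng)

    # calculator as a (constructor, args, kwargs) tuple, so each python subprocess
    # builds its own torch model on the GPU assigned to it via torch.cuda.set_device()
    calc = (mace_mp, ["small-omat-0"], {"device": "cuda"})

    co = generic.calculate(ConfigSet(ats), OutputSpec(), calc, output_prefix="_auto_",
                           autopara_info=AutoparaInfo(num_python_subprocesses=2,
                                                      num_inputs_per_python_subprocess=30))

With WFL_TORCH_N_GPUS unset, the same call still runs, but every subprocess uses the default CUDA device.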
diff --git a/wfl/__version__.py b/wfl/__version__.py index 334b8995..a8d4557d 100644 --- a/wfl/__version__.py +++ b/wfl/__version__.py @@ -1 +1 @@ -__version__ = "0.3.4" +__version__ = "0.3.5" diff --git a/wfl/autoparallelize/pool.py b/wfl/autoparallelize/pool.py index 7e2085ec..d964ce67 100644 --- a/wfl/autoparallelize/pool.py +++ b/wfl/autoparallelize/pool.py @@ -16,7 +16,25 @@ # except ModuleNotFoundError: # pass +# https://docs.pytorch.org/docs/stable/notes/multiprocessing.html#poison-fork-in-multiprocessing +# But, only use forkserver if needed because it has a lot of overhead +try: + import torch +except: + torch = None +if os.environ.get("WFL_TORCH_N_GPUS") is not None: + if not torch: + raise RuntimeError(f"Got WFL_TORCH_N_GPUS '{os.environ['WFL_TORCH_N_GPUS']}' but torch module is not available") + try: + import multiprocessing + multiprocessing.set_start_method('forkserver') + except RuntimeError as exc: + # ignore complains about setting start method more than once + pass from multiprocessing.pool import Pool +# to help keep track of distinct GPU for each python subprocess +# https://stackoverflow.com/questions/53422761/distributing-jobs-evenly-across-multiple-gpus-with-multiprocessing-pool +from multiprocessing import Queue, Manager from wfl.configset import ConfigSet from wfl.autoparallelize.mpipool_support import wfl_mpipool @@ -24,7 +42,7 @@ from .utils import grouper, items_inputs_generator, set_autopara_per_item_info -def _wrapped_autopara_wrappable(op, iterable_arg, inherited_per_item_info, args, kwargs, item_inputs): +def _wrapped_autopara_wrappable(op, iterable_arg, inherited_per_item_info, args, kwargs, gpu_id_queue, item_inputs): """Wrap an operation to be run in parallel by autoparallelize Parameters: @@ -40,6 +58,8 @@ def _wrapped_autopara_wrappable(op, iterable_arg, inherited_per_item_info, args, list of positional args kwargs: dict dict of keyword args + gpu_id_queue: Queue + queue to ensure that each subprocess gets a unique GPU item_inputs: iterable(4-tuples) One or more 4-tuples. (item, item_i, label, item_rng). 
item is passed to function in iterable_arg, item_i is its number in the overall list, label is a quantity to be passed back with the @@ -49,27 +69,39 @@ def _wrapped_autopara_wrappable(op, iterable_arg, inherited_per_item_info, args, ------- list of 2-tuples containing, the output of each function call, together with the corresponding label (3rd) field of item_inputs """ - for item_input in item_inputs: - assert len(item_input) == 4 + if gpu_id_queue is not None: + gpu_id = gpu_id_queue.get() + else: + gpu_id = None - # item_inputs is iterable of 4-tuples - item_list = [item_input[0] for item_input in item_inputs] - item_i_list = [item_input[1] for item_input in item_inputs] - label_list = [item_input[2] for item_input in item_inputs] - rng_list = [item_input[3] for item_input in item_inputs] + try: + for item_input in item_inputs: + assert len(item_input) == 4 - set_autopara_per_item_info(kwargs, op, inherited_per_item_info, rng_list, item_i_list) + if gpu_id is not None: + torch.cuda.set_device(gpu_id) - if isinstance(iterable_arg, int): - u_args = args[0:iterable_arg] + (item_list,) + args[iterable_arg:] - else: - u_args = args - if iterable_arg is not None: - kwargs[iterable_arg] = item_list + # item_inputs is iterable of 4-tuples + item_list = [item_input[0] for item_input in item_inputs] + item_i_list = [item_input[1] for item_input in item_inputs] + label_list = [item_input[2] for item_input in item_inputs] + rng_list = [item_input[3] for item_input in item_inputs] + + set_autopara_per_item_info(kwargs, op, inherited_per_item_info, rng_list, item_i_list) - outputs = op(*u_args, **kwargs) - if outputs is None: - outputs = [None] * len(item_list) + if isinstance(iterable_arg, int): + u_args = args[0:iterable_arg] + (item_list,) + args[iterable_arg:] + else: + u_args = args + if iterable_arg is not None: + kwargs[iterable_arg] = item_list + + outputs = op(*u_args, **kwargs) + if outputs is None: + outputs = [None] * len(item_list) + finally: + if gpu_id is not None: + gpu_id_queue.put(gpu_id) return zip(outputs, label_list) @@ -132,7 +164,7 @@ def do_in_pool(num_python_subprocesses=None, num_inputs_per_python_subprocess=1, f'always uses all MPI processes {wfl_mpipool.size}') if initializer[0] is not None: # generate a task for each mpi process that will call initializer with positional initargs - _ = wfl_mpipool.map(functools.partial(_wrapped_autopara_wrappable, initializer[0], None, None, initializer[1], {}), + _ = wfl_mpipool.map(functools.partial(_wrapped_autopara_wrappable, initializer[0], None, None, initializer[1], {}, None), grouper(1, ((None, None, None, None) for i in range(wfl_mpipool.size)))) pool = wfl_mpipool else: @@ -142,30 +174,37 @@ def do_in_pool(num_python_subprocesses=None, num_inputs_per_python_subprocess=1, initializer_args = {} pool = Pool(num_python_subprocesses, **initializer_args) - if wfl_mpipool: - map_f = pool.map - else: - map_f = pool.imap - results = map_f(functools.partial(_wrapped_autopara_wrappable, op, iterable_arg, - inherited_per_item_info, args, kwargs), items_inputs) - - if not wfl_mpipool: - # only close pool if it's from multiprocessing.pool - pool.close() - - # always loop over results to trigger lazy imap() - for result_group in results: - if outputspec is not None: - for at, from_input_loc in result_group: - if skip_failed and at is None: - continue - did_no_work = False - outputspec.store(at, from_input_loc) - - if not wfl_mpipool: - # call join pool (prevent pytest-cov deadlock as per 
https://pytest-cov.readthedocs.io/en/latest/subprocess-support.html) - # but only if it's from multiprocessing.pool - pool.join() + with Manager() as manager: + if os.environ.get("WFL_TORCH_N_GPUS") is not None: + gpu_id_queue = manager.Queue() + for subprocess_id in range(num_python_subprocesses): + gpu_id_queue.put(subprocess_id % int(os.environ["WFL_TORCH_N_GPUS"])) + else: + gpu_id_queue = None + if wfl_mpipool: + map_f = pool.map + else: + map_f = pool.imap + results = map_f(functools.partial(_wrapped_autopara_wrappable, op, iterable_arg, + inherited_per_item_info, args, kwargs, gpu_id_queue), items_inputs) + + if not wfl_mpipool: + # only close pool if it's from multiprocessing.pool + pool.close() + + # always loop over results to trigger lazy imap() + for result_group in results: + if outputspec is not None: + for at, from_input_loc in result_group: + if skip_failed and at is None: + continue + did_no_work = False + outputspec.store(at, from_input_loc) + + if not wfl_mpipool: + # call join pool (prevent pytest-cov deadlock as per https://pytest-cov.readthedocs.io/en/latest/subprocess-support.html) + # but only if it's from multiprocessing.pool + pool.join() else: # do directly: still not trivial because of num_inputs_per_python_subprocess @@ -174,7 +213,7 @@ def do_in_pool(num_python_subprocesses=None, num_inputs_per_python_subprocess=1, # unpickle to better reproduce the behavior of Pool.map() ? for items_inputs_group in items_inputs: result_group = _wrapped_autopara_wrappable(op, iterable_arg, inherited_per_item_info, args, - kwargs, items_inputs_group) + kwargs, None, items_inputs_group) if outputspec is not None: for at, from_input_loc in result_group: diff --git a/wfl/calculators/castep.py b/wfl/calculators/castep.py index 4f61bf53..32a6a8e4 100644 --- a/wfl/calculators/castep.py +++ b/wfl/calculators/castep.py @@ -25,7 +25,7 @@ class Castep(WFLFileIOCalculator, ASE_Castep): what kind of files to keep from the run - True : everything kept - None, False : nothing kept, unless calculation fails - - "default" : only ones needed for NOMAD uploads ('\*.pwo') + - "default" : only ones needed for NOMAD uploads ('\*.{castep,param,cell}') - list(str) : list of file globs to save rundir_prefix: str / Path, default 'run\_CASTEP\_' Run directory name prefix diff --git a/wfl/calculators/orca/__init__.py b/wfl/calculators/orca/__init__.py index c8075348..edfd3511 100644 --- a/wfl/calculators/orca/__init__.py +++ b/wfl/calculators/orca/__init__.py @@ -34,7 +34,7 @@ class ORCA(WFLFileIOCalculator, ASE_ORCA): what kind of files to keep from the run - True : everything kept - None, False : nothing kept, unless calculation fails - - "default" : only ones needed for NOMAD uploads ('\*.pwo') + - "default" : only ones needed for NOMAD uploads ('\*.{inp,out,engrad,xyz,_traj.xyz}') - list(str) : list of file globs to save rundir_prefix: str / Path, default 'ORCA\_' Run directory name prefix diff --git a/wfl/calculators/utils.py b/wfl/calculators/utils.py index 1ca8c6b6..d4fdf2ed 100644 --- a/wfl/calculators/utils.py +++ b/wfl/calculators/utils.py @@ -1,3 +1,5 @@ +from pathlib import Path + from wfl.utils.file_utils import clean_dir def clean_rundir(rundir, keep_files, default_keep_files, calculation_succeeded): @@ -7,19 +9,29 @@ def clean_rundir(rundir, keep_files, default_keep_files, calculation_succeeded): ---------- rundir: str path to run dir - keep_files: 'default' / list(str) / '*' / bool / None + keep_files: 'default' / iterable(file_glob) / '*' / bool / None files to keep, None or False for 
nothing, '*' or True for all default_keep_files: list(str) files to keep if keep_files == 'default' or calculation_succeeded is False + and keep_files is not True calculation_succeeded: bool """ - if not calculation_succeeded: - clean_dir(rundir, set(default_keep_files) | set(keep_files if keep_files else []), force=False) - elif keep_files == 'default': - clean_dir(rundir, default_keep_files, force=False) - elif not keep_files: - clean_dir(rundir, False, force=False) - else: - clean_dir(rundir, keep_files, force=False) + if isinstance(keep_files, str): + if keep_files == 'default': + keep_files = default_keep_files + elif keep_files == '*': + keep_files = [keep_files] + else: + raise ValueError(f"str keep_files can only be 'default' or '*', not '{keep_files}'") + # now keep_files should either be True, evaluate to False, or an iterable + + # calculation failed, default optionally union with keep_files + if not calculation_succeeded: + if not keep_files: + keep_files = default_keep_files + elif keep_files is not True: + keep_files = set(default_keep_files) | set(keep_files) + # else True + clean_dir(rundir, keep_files, force=False) diff --git a/wfl/generate/neb.py b/wfl/generate/neb.py index 1f2be063..400d4c7b 100644 --- a/wfl/generate/neb.py +++ b/wfl/generate/neb.py @@ -89,9 +89,10 @@ def process_step(): # preliminary value final_status = 'unconverged' + converged = False try: - opt.run(fmax=fmax, steps=steps) + converged = opt.run(fmax=fmax, steps=steps) except Exception as exc: # label actual failed optimizations # when this happens, the atomic config somehow ends up with a 6-vector stress, which can't be @@ -109,7 +110,9 @@ def process_step(): for at in traj[0]: at.info['neb_config_type'] = 'neb_initial' - if opt.converged(): + # from value returned by Optimizer.run(), since Optimizer.converged requires + # a gradient as of ASE 3.26 + if converged: final_status = 'converged' for intermed_images in traj[1:-1]: diff --git a/wfl/generate/optimize.py b/wfl/generate/optimize.py index 6af51889..ef150eb6 100644 --- a/wfl/generate/optimize.py +++ b/wfl/generate/optimize.py @@ -159,9 +159,10 @@ def process_step(): # preliminary value final_status = 'unconverged' + converged = False try: - opt.run(fmax=fmax, smax=smax, steps=steps) + converged = opt.run(fmax=fmax, smax=smax, steps=steps) except Exception as exc: # label actual failed optimizations # when this happens, the atomic config somehow ends up with a 6-vector stress, which can't be @@ -182,7 +183,10 @@ def process_step(): # set for first config, to be overwritten if it's also last config traj[0].info['optimize_config_type'] = 'optimize_initial' - if opt.converged(): + # as of 3.26 converged() requires a gradient, but for PreconLBFGS it's not used + # See https://gitlab.com/ase/ase/-/issues/1744 + # use value returned by Optimizer.run() instead + if converged: final_status = 'converged' traj[-1].info['optimize_config_type'] = f'optimize_last_{final_status}' diff --git a/wfl/utils/file_utils.py b/wfl/utils/file_utils.py index 87175938..dbc0570f 100644 --- a/wfl/utils/file_utils.py +++ b/wfl/utils/file_utils.py @@ -9,11 +9,11 @@ def clean_dir(directory, keep_files, force=False): Parameters ---------- directory : directory to be cleaned - keep_files: bool or list(filenames) or str + keep_files: bool or iterable(file_globs) What to keep in rundir when done: - - list(filenames) : ONLY these filenames if they exist - - True or "*" : everything - does nothing - - False or None : remove everything, or anything evaluating to False in if + - True 
: keep everything + - iterable : ONLY files matching these globs if they exist + - evaluates to False : nothing force : bool, default = False fail if directory does not exist @@ -29,21 +29,19 @@ def clean_dir(directory, keep_files, force=False): else: return - # defaults - if keep_files is None: - keep_files = False - elif keep_files == "*": - keep_files = True - elif isinstance(keep_files, str): - keep_files = [keep_files] + if isinstance(keep_files, str): + # common error that will not be caught below + raise ValueError(f"keep_files must not be str '{keep_files}'") # operations - if isinstance(keep_files, bool) and keep_files: + if keep_files is True: + # bool True, keep all return elif not keep_files: - # lets None and anything else evaluating to False + # evaluates to False, keep none shutil.rmtree(directory) - elif isinstance(keep_files, (list, tuple, set)): + else: + # keep_files should be an iterable extended_keep_files = [] for glob_index in keep_files: extended_keep_files.extend(glob(os.path.join(directory, glob_index))) @@ -56,5 +54,3 @@ def clean_dir(directory, keep_files, force=False): os.remove(abs_fn) else: shutil.rmtree(abs_fn) - else: - raise RuntimeError('Got unknown type or value for keep_rundir \'{}\''.format(keep_files))
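Two closing notes with illustrative sketches. First, the per-subprocess GPU assignment added to wfl/autoparallelize/pool.py boils down to the pattern below, shown standalone and without torch (in wfl the id taken from the queue is what gets passed to torch.cuda.set_device()); all names here are illustrative:

    from multiprocessing import Manager, Pool

    def work(args):
        gpu_id_queue, item = args
        gpu_id = gpu_id_queue.get()      # take a GPU id out of the shared queue
        try:
            return f"item {item} handled with GPU {gpu_id}"
        finally:
            gpu_id_queue.put(gpu_id)     # hand the id back for the next task

    if __name__ == "__main__":
        num_python_subprocesses, n_gpus = 4, 2
        with Manager() as manager:
            gpu_id_queue = manager.Queue()
            # pre-load one id per subprocess, cycling over the available GPUs
            for subprocess_id in range(num_python_subprocesses):
                gpu_id_queue.put(subprocess_id % n_gpus)
            with Pool(num_python_subprocesses) as pool:
                print(pool.map(work, [(gpu_id_queue, i) for i in range(8)]))

A manager queue is used (rather than a plain multiprocessing.Queue) because its proxy can be passed to pool workers as an ordinary argument.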
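Second, a self-contained demonstration of the keep_files handling that the reworked clean_rundir (and the new tests/test_clean_dir.py) define. The behaviour table summarises the code above; the run directory and file names are made up for illustration:

    import tempfile
    from pathlib import Path

    from wfl.calculators.utils import clean_rundir

    # keep_files           calculation succeeded       calculation failed
    # True                 keep everything             keep everything
    # False / None         remove the whole rundir     keep default_keep_files
    # "default"            keep default_keep_files     keep default_keep_files
    # iterable of globs    keep matching files         keep union with default_keep_files

    rundir = Path(tempfile.mkdtemp())
    for name in ["job.castep", "job.param", "job.cell", "scratch.tmp"]:
        (rundir / name).write_text("content\n")

    default_keep_files = ["*.castep", "*.param", "*.cell"]

    # a failed run with an extra requested glob keeps the defaults plus the extra match
    clean_rundir(rundir, ["*.tmp"], default_keep_files, calculation_succeeded=False)
    assert sorted(p.name for p in rundir.iterdir()) == ["job.castep", "job.cell", "job.param", "scratch.tmp"]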