From eca90924ee6ff7259bf46a13c81e84adb0ea9055 Mon Sep 17 00:00:00 2001
From: James Kunstle <jkunstle@redhat.com>
Date: Fri, 10 Jan 2025 16:36:05 -0800
Subject: [PATCH 1/2] updates /tests folder layout, adds test matrix and smoke
 test

Test groups are divided into three categories:
1) unit tests
2) smoke tests
3) benchmark tests

They each have a dedicated tox entrypoint.

Adds outer product of [FSDP, DeepSpeed] x [CPU offload, Not] test
matrix.

DEEPSPEED TESTS ARE BROKEN IN THIS COMMIT and are marked as xfail; they
will be fixed in a later commit.

Signed-off-by: James Kunstle <jkunstle@redhat.com>
---
 pyproject.toml                     |   5 +
 requirements-dev.txt               |   1 +
 tests/test_smoke/test_train.py     | 228 +++++++++++++++++++++++++++++
 tests/{ => test_unit}/test_init.py |   1 -
 tox.ini                            |  23 ++-
 5 files changed, 256 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_smoke/test_train.py
 rename tests/{ => test_unit}/test_init.py (77%)

diff --git a/pyproject.toml b/pyproject.toml
index ca053385..f56cfc64 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -107,3 +107,8 @@ exclude = [
 ]
 # honor excludes by not following there through imports
 follow_imports = "silent"
+
+[tool.pytest.ini_options]
+markers = [
+  "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+]
diff --git a/requirements-dev.txt b/requirements-dev.txt
index f77c807f..fcb76fbb 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -13,3 +13,4 @@ ipython
 ipykernel
 jupyter
 
+huggingface_hub
diff --git a/tests/test_smoke/test_train.py b/tests/test_smoke/test_train.py
new file mode 100644
index 00000000..fdc55293
--- /dev/null
+++ b/tests/test_smoke/test_train.py
@@ -0,0 +1,228 @@
+# Standard
+import os
+import pathlib
+import shutil
+import sys
+import tempfile
+
+# Third Party
+from transformers import AutoModelForCausalLM
+import huggingface_hub
+import pytest
+
+# First Party
+from instructlab.training import data_process
+from instructlab.training.config import (
+    DataProcessArgs,
+    DistributedBackend,
+    TorchrunArgs,
+    TrainingArgs,
+)
+from instructlab.training.main_ds import run_training
+
+MINIMAL_TRAINING_ARGS = {
+    "max_seq_len": 140,  # this config fits nicely on 4xL40s and may need modification for other setups
+    "max_batch_len": 15000,
+    "num_epochs": 1,
+    "effective_batch_size": 3840,
+    "save_samples": 0,
+    "learning_rate": 1e-4,
+    "warmup_steps": 1,
+    "random_seed": 43,
+    "use_dolomite": False,
+    "is_padding_free": False,
+    "checkpoint_at_epoch": True,
+    "accelerate_full_state_at_epoch": True,
+    "process_data": False,  # expect that incoming data has already been prepared and cached.
+    "disable_flash_attn": False,
+}
+
+DEFAULT_TORCHRUN_ARGS = {
+    "nproc_per_node": 4,  # TODO: this is runner-specific. Should parametrize from environment.
+    "nnodes": 1,
+    "node_rank": 0,
+    "rdzv_id": 123,
+    "rdzv_endpoint": "127.0.0.1:12345",
+}
+
+REFERENCE_TEST_MODEL = "instructlab/granite-7b-lab"
+RUNNER_CPUS_EXPECTED = 4
+
+# matrix of training environments we'd like to test
+DIST_BACKEND_FRAMEWORKS = ["fsdp", "deepspeed"]
+USE_DOLOMITE = [True, False]
+CPU_OFFLOADING = [True, False]
+USE_LORA = [True, False]
+
+
+@pytest.fixture(scope="module")
+def custom_tmp_path():
+    temp_dir = tempfile.mkdtemp()
+
+    temp_path = pathlib.Path(temp_dir)
+
+    yield temp_path
+
+    shutil.rmtree(temp_path)
+
+
+@pytest.fixture(scope="function")
+def checkpoint_dir(custom_tmp_path: pathlib.Path) -> pathlib.Path:
+    """
+    Creates a 'checkpoints' directory for each test and deletes it afterward.
+    """
+    ckpt_dir = custom_tmp_path / "checkpoints"
+    ckpt_dir.mkdir()
+
+    yield ckpt_dir
+
+    shutil.rmtree(ckpt_dir)
+
+
+@pytest.fixture(scope="module")
+def prepared_data_dir(custom_tmp_path: pathlib.Path) -> pathlib.Path:
+    data_file_dir = custom_tmp_path / "prepared_data"
+    data_file_dir.mkdir()
+
+    return data_file_dir
+
+
+@pytest.fixture(scope="module")
+def cached_model_dir(custom_tmp_path: pathlib.Path) -> pathlib.Path:
+    model_dir = custom_tmp_path / "model"
+    model_dir.mkdir()
+    return model_dir
+
+
+@pytest.fixture(scope="module")
+def cached_test_model(cached_model_dir: pathlib.Path) -> pathlib.Path:
+    """
+    Downloads test model artifacts to temporary cache from HF repo.
+    Assumes that the artifacts for the tokenizer are in the same repo.
+
+    Some interesting behavior:
+    (1) if model is already cached in $HF_HOME/hub/<model> the parameter blobs
+        will be copied into the specified `local_dir`. If some remote
+        files (like paper.pdf or tokenizer.config) aren't in the HF_HOME
+        cache, they'll be pulled and stored in the `local_dir` cache.
+    (2) if model is NOT already cached in $HF_HOME/hub/<model>, a reference will
+        still be created to it but the downloaded artifacts will not be copied
+        back to the HF_HOME cache from the `local_dir`.
+    """
+
+    huggingface_hub.snapshot_download(
+        token=os.getenv("HF_TOKEN", None),
+        repo_id=REFERENCE_TEST_MODEL,
+        local_dir=cached_model_dir,
+    )
+
+    return cached_model_dir
+
+
+def this_file_path() -> pathlib.Path:
+    return pathlib.Path(__file__).resolve()
+
+
+def data_in_repo_path() -> pathlib.Path:
+    current_file_path = this_file_path()
+    data_in_repo_path = (
+        current_file_path.parents[2] / "sample-data" / "train_all_pruned_SDG.jsonl"
+    )
+    return data_in_repo_path
+
+
+def chat_template_in_repo_path() -> pathlib.Path:
+    current_file_path = this_file_path()
+    chat_template_path = (
+        current_file_path.parents[2]
+        / "src"
+        / "instructlab"
+        / "training"
+        / "chat_templates"
+        / "ibm_generic_tmpl.py"
+    )
+    return chat_template_path
+
+
+# TODO: This uses our data preprocessing utility which is not, itself, well tested.
+# need to write tests for this as well.
+@pytest.fixture(scope="module")
+def cached_training_data(
+    prepared_data_dir: pathlib.Path, cached_test_model: pathlib.Path
+) -> pathlib.Path:
+    """Renders test data in model template, tokenizes, and saves to fs"""
+
+    data_in_repo = data_in_repo_path()
+    chat_template = chat_template_in_repo_path()
+
+    data_process_args = DataProcessArgs(
+        data_output_path=str(prepared_data_dir),
+        data_path=str(data_in_repo),
+        max_seq_len=MINIMAL_TRAINING_ARGS["max_seq_len"],
+        model_path=str(cached_test_model),
+        chat_tmpl_path=str(chat_template),
+        num_cpu_procs=RUNNER_CPUS_EXPECTED,
+    )
+
+    data_process.main(data_process_args)
+
+    return prepared_data_dir / "data.jsonl"
+
+
+@pytest.mark.skip
+@pytest.mark.slow
+def test_basic_training_run(
+    cached_test_model: pathlib.Path,
+    cached_training_data: pathlib.Path,
+    checkpoint_dir: pathlib.Path,
+    prepared_data_dir: pathlib.Path,
+) -> None:
+    """
+    Used for isolated test development. Skipped when not in use.
+    """
+
+    train_args = TrainingArgs(
+        model_path=str(cached_test_model),
+        data_path=str(cached_training_data),
+        data_output_dir=str(prepared_data_dir),
+        ckpt_output_dir=str(checkpoint_dir),
+        **MINIMAL_TRAINING_ARGS,
+    )
+
+    torch_args = TorchrunArgs(**DEFAULT_TORCHRUN_ARGS)
+
+    run_training(torch_args=torch_args, train_args=train_args)
+    assert True
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("dist_backend", DIST_BACKEND_FRAMEWORKS)
+@pytest.mark.parametrize("cpu_offload", CPU_OFFLOADING)
+def test_training_feature_matrix(
+    cached_test_model: pathlib.Path,
+    cached_training_data: pathlib.Path,
+    checkpoint_dir: pathlib.Path,
+    prepared_data_dir: pathlib.Path,
+    cpu_offload: bool,
+    dist_backend: str,
+) -> None:
+    train_args = TrainingArgs(
+        model_path=str(cached_test_model),
+        data_path=str(cached_training_data),
+        data_output_dir=str(prepared_data_dir),
+        ckpt_output_dir=str(checkpoint_dir),
+        **MINIMAL_TRAINING_ARGS,
+    )
+
+    train_args.distributed_backend = DistributedBackend(dist_backend)
+    if DistributedBackend.FSDP.value == dist_backend:
+        train_args.fsdp_options.cpu_offload_params = cpu_offload
+    else:
+        pytest.xfail("DeepSpeed not currently functional. OOMs during backprop.")
+        if cpu_offload:
+            pytest.xfail("DeepSpeed CPU Adam isn't currently building correctly")
+        train_args.deepspeed_options.cpu_offload_optimizer = cpu_offload
+
+    torch_args = TorchrunArgs(**DEFAULT_TORCHRUN_ARGS)
+
+    run_training(torch_args=torch_args, train_args=train_args)
diff --git a/tests/test_init.py b/tests/test_unit/test_init.py
similarity index 77%
rename from tests/test_init.py
rename to tests/test_unit/test_init.py
index b361b9ea..3212c37e 100644
--- a/tests/test_init.py
+++ b/tests/test_unit/test_init.py
@@ -2,6 +2,5 @@
 import pytest
 
 
-@pytest.mark.fast
 def test_fake():
     assert True
diff --git a/tox.ini b/tox.ini
index 86dca1ce..f9ee24c4 100644
--- a/tox.ini
+++ b/tox.ini
@@ -19,7 +19,15 @@ basepython = python3.11
 
 [testenv:py3-unit]
 description = run unit tests with pytest
-commands = {envpython} -m pytest tests {posargs}
+passenv =
+	HF_HOME
+deps = 
+    pytest
+    pytest-asyncio
+    pytest-cov
+    pytest-html
+    -r requirements-dev.txt
+commands = {envpython} -m pytest tests/test_unit {posargs}
 # NOTE: {posargs} is a placeholder for input positional arguments
 # such as `tox -e py3-unit -- --pdb` if we wanted to run pytest with pdb enabled.
 # `--` delimits flags that are meant for tox vs. those that are positional arguments for
@@ -27,6 +35,19 @@ commands = {envpython} -m pytest tests {posargs}
 
 # format, check, and linting targets don't build and install the project to
 # speed up testing.
+[testenv:py3-smoke]
+description = run accelerated smoke tests with pytest
+passenv =
+	HF_HOME
+deps = 
+    pytest
+    pytest-asyncio
+    pytest-cov
+    pytest-html
+    -r requirements-dev.txt
+    -r requirements-cuda.txt
+commands = {envpython} -m pytest tests/test_smoke {posargs}
+
 [testenv:lint]
 description = lint with pylint
 basepython = {[testenv:py3]basepython}

From b95099bde8347f966ca456027cd2dc9edff843e7 Mon Sep 17 00:00:00 2001
From: James Kunstle <jkunstle@redhat.com>
Date: Fri, 24 Jan 2025 18:24:10 -0800
Subject: [PATCH 2/2] adds smoke test workflow

users can dispatch a workflow that runs smoke tests against a selected
branch

Signed-off-by: James Kunstle <jkunstle@redhat.com>
---
 .github/workflows/e2e-nvidia-l4-x1.yml        |  19 +--
 .github/workflows/smoke.yaml                  | 146 ++++++++++++++++++
 .../workflows/{unit-tests.yaml => unit.yaml}  |   0
 tests/{test_smoke => smoke}/test_train.py     | 121 ++++++++-------
 tests/{test_unit => unit}/test_init.py        |   0
 tox.ini                                       |   6 +-
 6 files changed, 227 insertions(+), 65 deletions(-)
 create mode 100644 .github/workflows/smoke.yaml
 rename .github/workflows/{unit-tests.yaml => unit.yaml} (100%)
 rename tests/{test_smoke => smoke}/test_train.py (65%)
 rename tests/{test_unit => unit}/test_init.py (100%)

diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml
index 5ff9de55..ef4d9afb 100644
--- a/.github/workflows/e2e-nvidia-l4-x1.yml
+++ b/.github/workflows/e2e-nvidia-l4-x1.yml
@@ -15,10 +15,11 @@ on:
       - release-*
     paths:
       # note this should match the merging criteria in 'mergify.yml'
-      - '**.py'
-      - 'pyproject.toml'
-      - 'requirements**.txt'
-      - '.github/workflows/e2e-nvidia-l4-x1.yml' # This workflow
+      - "**.py"
+      - "pyproject.toml"
+      - "requirements**.txt"
+      - ".github/workflows/e2e-nvidia-l4-x1.yml" # This workflow
+      - "!tests/**" # we don't need to run e2e if we're just changing the tests.
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -72,7 +73,7 @@ jobs:
               {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
               {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
             ]
-  
+
   e2e-medium-test:
     needs:
       - start-medium-ec2-runner
@@ -153,7 +154,7 @@ jobs:
           . venv/bin/activate
           # set preserve to true so we can retain the logs
           ./scripts/e2e-ci.sh -mp
-          
+
           # HACK(osilkin): The above test runs the medium workflow test which does not actually test the training library.
           #                Therefore we must disable the upload of the training logs, as they will not exist in the same location.
           # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
@@ -200,7 +201,7 @@ jobs:
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
           label: ${{ needs.start-medium-ec2-runner.outputs.label }}
           ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}
-      
+
       # - name: Download loss data
       #   id: download-logs
       #   uses: actions/download-artifact@v4
@@ -211,12 +212,12 @@ jobs:
       # - name: Install dependencies
       #   run: |
       #     pip install -r requirements-dev.txt
-      
+
       # - name: Try to upload to s3
       #   id: upload-s3
       #   continue-on-error: true
       #   run: |
-      #     output_file='./test.md' 
+      #     output_file='./test.md'
       #     python scripts/create-loss-graph.py  \
       #       --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
       #       --output-file "${output_file}" \
diff --git a/.github/workflows/smoke.yaml b/.github/workflows/smoke.yaml
new file mode 100644
index 00000000..a8d39210
--- /dev/null
+++ b/.github/workflows/smoke.yaml
@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: Apache-2.0
+
+name: "Run smoke tests via Tox::pytest"
+# These tests will be long running and require accelerated hardware.
+
+on:
+  workflow_dispatch:
+    inputs:
+      branch:
+        type: string
+        default: main
+  # using this rather than pull_request because this workflow
+  # needs to run in the context of the base branch (main) and
+  # access the repo's secrets to start the AWS instances.
+  pull_request_target:
+    branches:
+      - main
+      - release-*
+
+permissions:
+  contents: read
+
+defaults:
+  run:
+    shell: bash
+
+env:
+  ec2_runner_variant: "g6e.12xlarge" # 4x L40s
+
+jobs:
+  start-ec2-runner:
+    runs-on: ubuntu-latest
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id}}
+
+    steps:
+      - name: "Harden runner"
+        uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1
+        with:
+          egress-policy: audit
+
+      - name: "Configure AWS credentials"
+        uses: "aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502" # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ vars.AWS_REGION }}
+
+      - name: "Start EC2 runner"
+        id: start-ec2-runner
+        uses: machulav/ec2-github-runner@28fbe1c4d7d9ba74134ca5ebc559d5b0a989a856 # v2.3.8
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          ec2-image-id: ${{ vars.AWS_EC2_AMI }}
+          ec2-instance-type: ${{ env.ec2_runner_variant }}
+          subnet-id: subnet-024298cefa3bedd61
+          security-group-id: sg-06300447c4a5fbef3
+          iam-role-name: instructlab-ci-runner
+          aws-resource-tags: >
+            [
+            {"Key": "Name", "Value": "instructlab-ci-github-smoketest-runner"},
+            {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
+            {"Key": "GitHubRef", "Value": "${{ github.ref }}"}
+            ]
+
+  run-smoke-tests:
+    needs:
+      - start-ec2-runner
+    runs-on: ${{needs.start-ec2-runner.outputs.label}}
+    # It is important that this job has no write permissions and has
+    # no access to any secrets. This part is where we are running
+    # untrusted code from PRs.
+    permissions: {}
+    steps:
+      - name: "Harden runner"
+        uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1
+        with:
+          egress-policy: audit
+
+      - name: "Install packages"
+        run: |
+          cat /etc/os-release
+          sudo dnf install -y gcc gcc-c++ make git-core python3.11 python3.11-devel
+
+      - name: "Verify cuda environment is setup"
+        run: |
+          export CUDA_HOME="/usr/local/cuda"
+          export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64"
+          export PATH="${PATH}:${CUDA_HOME}/bin"
+          nvidia-smi
+
+      - name: "Checkout code"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+          ref: ${{inputs.branch}}
+
+      # installs in $GITHUB_WORKSPACE/venv.
+      # only has to install Tox because Tox will do the other virtual environment management.
+      - name: "Setup Python virtual environment"
+        run: |
+          python3.11 -m venv --upgrade-deps venv
+          . venv/bin/activate
+          pip install tox
+
+      - name: "Show disk utilization BEFORE tests"
+        run: |
+          df -h
+
+      - name: "Run smoke tests with Tox and Pytest"
+        run: |
+          source venv/bin/activate
+          tox -e py3-smoke
+
+      - name: "Show disk utilization AFTER tests"
+        run: |
+          df -h
+
+  stop-ec2-runner:
+    needs:
+      - start-ec2-runner
+      - run-smoke-tests
+    runs-on: ubuntu-latest
+    if: ${{ always() }}
+    steps:
+      - name: "Harden runner"
+        uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1
+        with:
+          egress-policy: audit
+
+      - name: "Configure AWS credentials"
+        uses: "aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502" # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ vars.AWS_REGION }}
+
+      - name: "Stop EC2 runner"
+        uses: machulav/ec2-github-runner@28fbe1c4d7d9ba74134ca5ebc559d5b0a989a856 # v2.3.8
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-ec2-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-ec2-runner.outputs.ec2-instance-id }}
diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit.yaml
similarity index 100%
rename from .github/workflows/unit-tests.yaml
rename to .github/workflows/unit.yaml
diff --git a/tests/test_smoke/test_train.py b/tests/smoke/test_train.py
similarity index 65%
rename from tests/test_smoke/test_train.py
rename to tests/smoke/test_train.py
index fdc55293..d7e81cc9 100644
--- a/tests/test_smoke/test_train.py
+++ b/tests/smoke/test_train.py
@@ -1,4 +1,5 @@
 # Standard
+from typing import Generator
 import os
 import pathlib
 import shutil
@@ -48,15 +49,16 @@
 REFERENCE_TEST_MODEL = "instructlab/granite-7b-lab"
 RUNNER_CPUS_EXPECTED = 4
 
-# matrix of training environments we'd like to test
-DIST_BACKEND_FRAMEWORKS = ["fsdp", "deepspeed"]
-USE_DOLOMITE = [True, False]
-CPU_OFFLOADING = [True, False]
-USE_LORA = [True, False]
-
 
 @pytest.fixture(scope="module")
-def custom_tmp_path():
+def custom_tmp_dir() -> Generator[pathlib.Path, None, None]:
+    """A custom fixture for a temporary directory.
+    By default, the builtin `tmp_path` fixture is function-scoped,
+    but this module-scoped one lets many tests reuse the same cached storage.
+
+    Yields:
+        Generator[pathlib.Path, None, None]: path to root directory of temp storage.
+    """
     temp_dir = tempfile.mkdtemp()
 
     temp_path = pathlib.Path(temp_dir)
@@ -67,11 +69,15 @@ def custom_tmp_path():
 
 
 @pytest.fixture(scope="function")
-def checkpoint_dir(custom_tmp_path: pathlib.Path) -> pathlib.Path:
+def checkpoint_dir(
+    custom_tmp_dir: pathlib.Path,
+) -> Generator[pathlib.Path, None, None]:
     """
-    Creates a 'checkpoints' directory for each test and deletes it afterward.
+    Creates a 'checkpoints' directory.
+    This directory must be function-scoped because each test
+    will create its own checkpoints.
     """
-    ckpt_dir = custom_tmp_path / "checkpoints"
+    ckpt_dir = custom_tmp_dir / "checkpoints"
     ckpt_dir.mkdir()
 
     yield ckpt_dir
@@ -80,16 +86,32 @@ def checkpoint_dir(custom_tmp_path: pathlib.Path) -> pathlib.Path:
 
 
 @pytest.fixture(scope="module")
-def prepared_data_dir(custom_tmp_path: pathlib.Path) -> pathlib.Path:
-    data_file_dir = custom_tmp_path / "prepared_data"
+def prepared_data_dir(custom_tmp_dir: pathlib.Path) -> pathlib.Path:
+    """Sets up module-scoped temporary dir for storage of preprocessed data.
+
+    Args:
+        custom_tmp_dir (pathlib.Path): root dir of temporary storage
+
+    Returns:
+        pathlib.Path: path to directory where preprocessed data can be cached
+    """
+    data_file_dir = custom_tmp_dir / "prepared_data"
     data_file_dir.mkdir()
 
     return data_file_dir
 
 
 @pytest.fixture(scope="module")
-def cached_model_dir(custom_tmp_path: pathlib.Path) -> pathlib.Path:
-    model_dir = custom_tmp_path / "model"
+def cached_model_dir(custom_tmp_dir: pathlib.Path) -> pathlib.Path:
+    """Sets up module-scoped temporary dir for storage of model checkpoint
+
+    Args:
+        custom_tmp_dir (pathlib.Path): root dir of temporary storage
+
+    Returns:
+        pathlib.Path: path to directory where model checkpoint can be cached
+    """
+    model_dir = custom_tmp_dir / "model"
     model_dir.mkdir()
     return model_dir
 
@@ -111,7 +133,6 @@ def cached_test_model(cached_model_dir: pathlib.Path) -> pathlib.Path:
     """
 
     huggingface_hub.snapshot_download(
-        token=os.getenv("HF_TOKEN", None),
         repo_id=REFERENCE_TEST_MODEL,
         local_dir=cached_model_dir,
     )
@@ -120,21 +141,40 @@ def cached_test_model(cached_model_dir: pathlib.Path) -> pathlib.Path:
 
 
 def this_file_path() -> pathlib.Path:
+    """returns the fully qualified path to this file."""
     return pathlib.Path(__file__).resolve()
 
 
-def data_in_repo_path() -> pathlib.Path:
+def repo_root_dir() -> pathlib.Path:
+    """returns the fully qualified path to the root of the repo."""
     current_file_path = this_file_path()
-    data_in_repo_path = (
-        current_file_path.parents[2] / "sample-data" / "train_all_pruned_SDG.jsonl"
-    )
+    return current_file_path.parents[2]
+
+
+def data_in_repo_path() -> pathlib.Path:
+    """The data that we'll use in these tests is stored in the repo as an artifact.
+    This returns a path to `sample-data/train_all_pruned_SDG.jsonl`, resolved
+    relative to this file's location in the repo.
+
+    Returns:
+        pathlib.Path: Path to a `.jsonl` file for tests
+    """
+    repo_root = repo_root_dir()
+    data_in_repo_path = repo_root / "sample-data" / "train_all_pruned_SDG.jsonl"
     return data_in_repo_path
 
 
 def chat_template_in_repo_path() -> pathlib.Path:
-    current_file_path = this_file_path()
+    """The chat template that we'll use in these tests is stored in the repo as an artifact.
+    This returns a path to the `ibm_generic_tmpl.py` file based on this file's location
+    in the repo.
+
+    Returns:
+        pathlib.Path: Path to the `ibm_generic_tmpl.py` chat template file for tests
+    """
+    repo_root = repo_root_dir()
     chat_template_path = (
-        current_file_path.parents[2]
+        repo_root
         / "src"
         / "instructlab"
         / "training"
@@ -169,42 +209,18 @@ def cached_training_data(
     return prepared_data_dir / "data.jsonl"
 
 
-@pytest.mark.skip
 @pytest.mark.slow
-def test_basic_training_run(
-    cached_test_model: pathlib.Path,
-    cached_training_data: pathlib.Path,
-    checkpoint_dir: pathlib.Path,
-    prepared_data_dir: pathlib.Path,
-) -> None:
-    """
-    Used for isolated test development. Skipped when not in use.
-    """
-
-    train_args = TrainingArgs(
-        model_path=str(cached_test_model),
-        data_path=str(cached_training_data),
-        data_output_dir=str(prepared_data_dir),
-        ckpt_output_dir=str(checkpoint_dir),
-        **MINIMAL_TRAINING_ARGS,
-    )
-
-    torch_args = TorchrunArgs(**DEFAULT_TORCHRUN_ARGS)
-
-    run_training(torch_args=torch_args, train_args=train_args)
-    assert True
-
-
-@pytest.mark.slow
-@pytest.mark.parametrize("dist_backend", DIST_BACKEND_FRAMEWORKS)
-@pytest.mark.parametrize("cpu_offload", CPU_OFFLOADING)
+@pytest.mark.parametrize(
+    "dist_backend", [DistributedBackend.FSDP, DistributedBackend.DEEPSPEED]
+)
+@pytest.mark.parametrize("cpu_offload", [True, False])
 def test_training_feature_matrix(
     cached_test_model: pathlib.Path,
     cached_training_data: pathlib.Path,
     checkpoint_dir: pathlib.Path,
     prepared_data_dir: pathlib.Path,
     cpu_offload: bool,
-    dist_backend: str,
+    dist_backend: DistributedBackend,
 ) -> None:
     train_args = TrainingArgs(
         model_path=str(cached_test_model),
@@ -214,8 +230,9 @@ def test_training_feature_matrix(
         **MINIMAL_TRAINING_ARGS,
     )
 
-    train_args.distributed_backend = DistributedBackend(dist_backend)
-    if DistributedBackend.FSDP.value == dist_backend:
+    train_args.distributed_backend = dist_backend
+
+    if dist_backend == DistributedBackend.FSDP:
         train_args.fsdp_options.cpu_offload_params = cpu_offload
     else:
         pytest.xfail("DeepSpeed not currently functional. OOMs during backprop.")
diff --git a/tests/test_unit/test_init.py b/tests/unit/test_init.py
similarity index 100%
rename from tests/test_unit/test_init.py
rename to tests/unit/test_init.py
diff --git a/tox.ini b/tox.ini
index f9ee24c4..97b98be0 100644
--- a/tox.ini
+++ b/tox.ini
@@ -27,14 +27,12 @@ deps =
     pytest-cov
     pytest-html
     -r requirements-dev.txt
-commands = {envpython} -m pytest tests/test_unit {posargs}
+commands = {envpython} -m pytest tests/unit {posargs}
 # NOTE: {posargs} is a placeholder for input positional arguments
 # such as `tox -e py3-unit -- --pdb` if we wanted to run pytest with pdb enabled.
 # `--` delimits flags that are meant for tox vs. those that are positional arguments for
 # the command that's being run in the environment.
 
-# format, check, and linting targets don't build and install the project to
-# speed up testing.
 [testenv:py3-smoke]
 description = run accelerated smoke tests with pytest
 passenv =
@@ -42,12 +40,12 @@ passenv =
 deps = 
     pytest
     pytest-asyncio
-    pytest-cov
-    pytest-html
     -r requirements-dev.txt
     -r requirements-cuda.txt
 commands = {envpython} -m pytest tests/test_smoke {posargs}
 
+# format, check, and linting targets don't build and install the project to
+# speed up testing.
 [testenv:lint]
 description = lint with pylint
 basepython = {[testenv:py3]basepython}