From eca90924ee6ff7259bf46a13c81e84adb0ea9055 Mon Sep 17 00:00:00 2001 From: James Kunstle Date: Fri, 10 Jan 2025 16:36:05 -0800 Subject: [PATCH 1/2] updates /tests folder layout, adds test matrix and smoke test Test groups are divided into three categories: 1) unit tests 2) smoke tests 3) benchmark tests They each have a dedicated tox entrypoint. Adds outer product of [FSDP, DeepSpeed] x [CPU offload, Not] test matrix. DEEPSPEED TESTS ARE BROKEN IN THIS COMMIT and are marked xFail- to be fixed in another, later commit. Signed-off-by: James Kunstle --- pyproject.toml | 5 + requirements-dev.txt | 1 + tests/test_smoke/test_train.py | 228 +++++++++++++++++++++++++++++ tests/{ => test_unit}/test_init.py | 1 - tox.ini | 23 ++- 5 files changed, 256 insertions(+), 2 deletions(-) create mode 100644 tests/test_smoke/test_train.py rename tests/{ => test_unit}/test_init.py (77%) diff --git a/pyproject.toml b/pyproject.toml index ca053385..f56cfc64 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,3 +107,8 @@ exclude = [ ] # honor excludes by not following there through imports follow_imports = "silent" + +[tool.pytest.ini_options] +markers = [ + "slow: marks tests as slow (deselect with '-m \"not slow\"')", +] diff --git a/requirements-dev.txt b/requirements-dev.txt index f77c807f..fcb76fbb 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -13,3 +13,4 @@ ipython ipykernel jupyter +huggingface_hub diff --git a/tests/test_smoke/test_train.py b/tests/test_smoke/test_train.py new file mode 100644 index 00000000..fdc55293 --- /dev/null +++ b/tests/test_smoke/test_train.py @@ -0,0 +1,228 @@ +# Standard +import os +import pathlib +import shutil +import sys +import tempfile + +# Third Party +from transformers import AutoModelForCausalLM +import huggingface_hub +import pytest + +# First Party +from instructlab.training import data_process +from instructlab.training.config import ( + DataProcessArgs, + DistributedBackend, + TorchrunArgs, + TrainingArgs, +) +from 
instructlab.training.main_ds import run_training + +MINIMAL_TRAINING_ARGS = { + "max_seq_len": 140, # this config fits nicely on 4xL40s and may need modification for other setups + "max_batch_len": 15000, + "num_epochs": 1, + "effective_batch_size": 3840, + "save_samples": 0, + "learning_rate": 1e-4, + "warmup_steps": 1, + "random_seed": 43, + "use_dolomite": False, + "is_padding_free": False, + "checkpoint_at_epoch": True, + "accelerate_full_state_at_epoch": True, + "process_data": False, # expect that incoming data has already been prepared and cached. + "disable_flash_attn": False, +} + +DEFAULT_TORCHRUN_ARGS = { + "nproc_per_node": 4, # TODO: this is runner-specific. Should parametrize from environment. + "nnodes": 1, + "node_rank": 0, + "rdzv_id": 123, + "rdzv_endpoint": "127.0.0.1:12345", +} + +REFERENCE_TEST_MODEL = "instructlab/granite-7b-lab" +RUNNER_CPUS_EXPECTED = 4 + +# matrix of training environments we'd like to test +DIST_BACKEND_FRAMEWORKS = ["fsdp", "deepspeed"] +USE_DOLOMITE = [True, False] +CPU_OFFLOADING = [True, False] +USE_LORA = [True, False] + + +@pytest.fixture(scope="module") +def custom_tmp_path(): + temp_dir = tempfile.mkdtemp() + + temp_path = pathlib.Path(temp_dir) + + yield temp_path + + shutil.rmtree(temp_path) + + +@pytest.fixture(scope="function") +def checkpoint_dir(custom_tmp_path: pathlib.Path) -> pathlib.Path: + """ + Creates a 'checkpoints' directory for each test and deletes it afterward. 
+ """ + ckpt_dir = custom_tmp_path / "checkpoints" + ckpt_dir.mkdir() + + yield ckpt_dir + + shutil.rmtree(ckpt_dir) + + +@pytest.fixture(scope="module") +def prepared_data_dir(custom_tmp_path: pathlib.Path) -> pathlib.Path: + data_file_dir = custom_tmp_path / "prepared_data" + data_file_dir.mkdir() + + return data_file_dir + + +@pytest.fixture(scope="module") +def cached_model_dir(custom_tmp_path: pathlib.Path) -> pathlib.Path: + model_dir = custom_tmp_path / "model" + model_dir.mkdir() + return model_dir + + +@pytest.fixture(scope="module") +def cached_test_model(cached_model_dir: pathlib.Path) -> pathlib.Path: + """ + Downloads test model artifacts to temporary cache from HF repo. + Assumes that the artifacts for the tokenizer are in the same repo. + + Some interesting behavior: + (1) if model is already cached in $HF_HOME/hub/ the parameter blobs + will be copied into the specified `local_dir`. If some remote + files (like paper.pdf or tokenizer.config) aren't in the HF_HOME + cache, they'll be pulled and stored in the `local_dir` cache. + (2) if model is NOT already cached in $HF_HOME/hub/, a reference will + still be created to it but the downloaded artifacts will not be copied + back to the HF_HOME cache from the `local_dir`. 
+ """ + + huggingface_hub.snapshot_download( + token=os.getenv("HF_TOKEN", None), + repo_id=REFERENCE_TEST_MODEL, + local_dir=cached_model_dir, + ) + + return cached_model_dir + + +def this_file_path() -> pathlib.Path: + return pathlib.Path(__file__).resolve() + + +def data_in_repo_path() -> pathlib.Path: + current_file_path = this_file_path() + data_in_repo_path = ( + current_file_path.parents[2] / "sample-data" / "train_all_pruned_SDG.jsonl" + ) + return data_in_repo_path + + +def chat_template_in_repo_path() -> pathlib.Path: + current_file_path = this_file_path() + chat_template_path = ( + current_file_path.parents[2] + / "src" + / "instructlab" + / "training" + / "chat_templates" + / "ibm_generic_tmpl.py" + ) + return chat_template_path + + +# TODO: This uses our data preprocessing utility which is not, itself, well tested. +# need to write tests for this as well. +@pytest.fixture(scope="module") +def cached_training_data( + prepared_data_dir: pathlib.Path, cached_test_model: pathlib.Path +) -> pathlib.Path: + """Renders test data in model template, tokenizes, and saves to fs""" + + data_in_repo = data_in_repo_path() + chat_template = chat_template_in_repo_path() + + data_process_args = DataProcessArgs( + data_output_path=str(prepared_data_dir), + data_path=str(data_in_repo), + max_seq_len=MINIMAL_TRAINING_ARGS["max_seq_len"], + model_path=str(cached_test_model), + chat_tmpl_path=str(chat_template), + num_cpu_procs=RUNNER_CPUS_EXPECTED, + ) + + data_process.main(data_process_args) + + return prepared_data_dir / "data.jsonl" + + +@pytest.mark.skip +@pytest.mark.slow +def test_basic_training_run( + cached_test_model: pathlib.Path, + cached_training_data: pathlib.Path, + checkpoint_dir: pathlib.Path, + prepared_data_dir: pathlib.Path, +) -> None: + """ + Used for isolated test development. Skipped when not in use. 
+ """ + + train_args = TrainingArgs( + model_path=str(cached_test_model), + data_path=str(cached_training_data), + data_output_dir=str(prepared_data_dir), + ckpt_output_dir=str(checkpoint_dir), + **MINIMAL_TRAINING_ARGS, + ) + + torch_args = TorchrunArgs(**DEFAULT_TORCHRUN_ARGS) + + run_training(torch_args=torch_args, train_args=train_args) + assert True + + +@pytest.mark.slow +@pytest.mark.parametrize("dist_backend", DIST_BACKEND_FRAMEWORKS) +@pytest.mark.parametrize("cpu_offload", CPU_OFFLOADING) +def test_training_feature_matrix( + cached_test_model: pathlib.Path, + cached_training_data: pathlib.Path, + checkpoint_dir: pathlib.Path, + prepared_data_dir: pathlib.Path, + cpu_offload: bool, + dist_backend: str, +) -> None: + train_args = TrainingArgs( + model_path=str(cached_test_model), + data_path=str(cached_training_data), + data_output_dir=str(prepared_data_dir), + ckpt_output_dir=str(checkpoint_dir), + **MINIMAL_TRAINING_ARGS, + ) + + train_args.distributed_backend = DistributedBackend(dist_backend) + if DistributedBackend.FSDP.value == dist_backend: + train_args.fsdp_options.cpu_offload_params = cpu_offload + else: + pytest.xfail("DeepSpeed not currently functional. 
OOMs during backprop.") + if cpu_offload: + pytest.xfail("DeepSpeed CPU Adam isn't currently building correctly") + train_args.deepspeed_options.cpu_offload_optimizer = cpu_offload + + torch_args = TorchrunArgs(**DEFAULT_TORCHRUN_ARGS) + + run_training(torch_args=torch_args, train_args=train_args) diff --git a/tests/test_init.py b/tests/test_unit/test_init.py similarity index 77% rename from tests/test_init.py rename to tests/test_unit/test_init.py index b361b9ea..3212c37e 100644 --- a/tests/test_init.py +++ b/tests/test_unit/test_init.py @@ -2,6 +2,5 @@ import pytest -@pytest.mark.fast def test_fake(): assert True diff --git a/tox.ini b/tox.ini index 86dca1ce..f9ee24c4 100644 --- a/tox.ini +++ b/tox.ini @@ -19,7 +19,15 @@ basepython = python3.11 [testenv:py3-unit] description = run unit tests with pytest -commands = {envpython} -m pytest tests {posargs} +passenv = + HF_HOME +deps = + pytest + pytest-asyncio + pytest-cov + pytest-html + -r requirements-dev.txt +commands = {envpython} -m pytest tests/test_unit {posargs} # NOTE: {posargs} is a placeholder for input positional arguments # such as `tox -e py3-unit -- --pdb` if we wanted to run pytest with pdb enabled. # `--` delimits flags that are meant for tox vs. those that are positional arguments for @@ -27,6 +35,19 @@ commands = {envpython} -m pytest tests {posargs} # format, check, and linting targets don't build and install the project to # speed up testing. 
+[testenv:py3-smoke] +description = run accelerated smoke tests with pytest +passenv = + HF_HOME +deps = + pytest + pytest-asyncio + pytest-cov + pytest-html + -r requirements-dev.txt + -r requirements-cuda.txt +commands = {envpython} -m pytest tests/test_smoke {posargs} + [testenv:lint] description = lint with pylint basepython = {[testenv:py3]basepython} From b95099bde8347f966ca456027cd2dc9edff843e7 Mon Sep 17 00:00:00 2001 From: James Kunstle Date: Fri, 24 Jan 2025 18:24:10 -0800 Subject: [PATCH 2/2] adds smoke test workflow users can dispatch a workflow that runs smoke tests against a selected branch Signed-off-by: James Kunstle --- .github/workflows/e2e-nvidia-l4-x1.yml | 19 +-- .github/workflows/smoke.yaml | 146 ++++++++++++++++++ .../workflows/{unit-tests.yaml => unit.yaml} | 0 tests/{test_smoke => smoke}/test_train.py | 121 ++++++++------- tests/{test_unit => unit}/test_init.py | 0 tox.ini | 6 +- 6 files changed, 227 insertions(+), 65 deletions(-) create mode 100644 .github/workflows/smoke.yaml rename .github/workflows/{unit-tests.yaml => unit.yaml} (100%) rename tests/{test_smoke => smoke}/test_train.py (65%) rename tests/{test_unit => unit}/test_init.py (100%) diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml index 5ff9de55..ef4d9afb 100644 --- a/.github/workflows/e2e-nvidia-l4-x1.yml +++ b/.github/workflows/e2e-nvidia-l4-x1.yml @@ -15,10 +15,11 @@ on: - release-* paths: # note this should match the merging criteria in 'mergify.yml' - - '**.py' - - 'pyproject.toml' - - 'requirements**.txt' - - '.github/workflows/e2e-nvidia-l4-x1.yml' # This workflow + - "**.py" + - "pyproject.toml" + - "requirements**.txt" + - ".github/workflows/e2e-nvidia-l4-x1.yml" # This workflow + - "!tests/**" # we don't need to run e2e if we're just changing the tests. 
concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -72,7 +73,7 @@ jobs: {"Key": "GitHubRef", "Value": "${{ github.ref }}"}, {"Key": "GitHubPR", "Value": "${{ github.event.number }}"} ] - + e2e-medium-test: needs: - start-medium-ec2-runner @@ -153,7 +154,7 @@ jobs: . venv/bin/activate # set preserve to true so we can retain the logs ./scripts/e2e-ci.sh -mp - + # HACK(osilkin): The above test runs the medium workflow test which does not actually test the training library. # Therefore we must disable the upload of the training logs, as they will not exist in the same location. # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python @@ -200,7 +201,7 @@ jobs: github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} label: ${{ needs.start-medium-ec2-runner.outputs.label }} ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }} - + # - name: Download loss data # id: download-logs # uses: actions/download-artifact@v4 @@ -211,12 +212,12 @@ jobs: # - name: Install dependencies # run: | # pip install -r requirements-dev.txt - + # - name: Try to upload to s3 # id: upload-s3 # continue-on-error: true # run: | - # output_file='./test.md' + # output_file='./test.md' # python scripts/create-loss-graph.py \ # --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \ # --output-file "${output_file}" \ diff --git a/.github/workflows/smoke.yaml b/.github/workflows/smoke.yaml new file mode 100644 index 00000000..a8d39210 --- /dev/null +++ b/.github/workflows/smoke.yaml @@ -0,0 +1,146 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: "Run smoke tests via Tox::pytest" +# These tests will be long running and require accelerated hardware. 
+ +on: + workflow_dispatch: + inputs: + branch: + type: string + default: main + # using this rather than pull_request because this workflow + # needs to run in the context of the base branch (main) and + # access the repo's secrets to start the AWS instances. + pull_request_target: + branches: + - main + - release-* + +permissions: + contents: read + +defaults: + run: + shell: bash + +env: + ec2_runner_variant: "g6e.12xlarge" # 4x L40s + +jobs: + start-ec2-runner: + runs-on: ubuntu-latest + outputs: + label: ${{ steps.start-ec2-runner.outputs.label }} + ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id}} + + steps: + - name: "Harden runner" + uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1 + with: + egress-policy: audit + + - name: "Configure AWS credentials" + uses: "aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502" # v4.0.2 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ vars.AWS_REGION }} + + - name: "Start EC2 runner" + id: start-ec2-runner + uses: machulav/ec2-github-runner@28fbe1c4d7d9ba74134ca5ebc559d5b0a989a856 # v2.3.8 + with: + mode: start + github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + ec2-image-id: ${{ vars.AWS_EC2_AMI }} + ec2-instance-type: ${{ env.ec2_runner_variant }} + subnet-id: subnet-024298cefa3bedd61 + security-group-id: sg-06300447c4a5fbef3 + iam-role-name: instructlab-ci-runner + aws-resource-tags: > + [ + {"Key": "Name", "Value": "instructlab-ci-github-smoketest-runner"}, + {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, + {"Key": "GitHubRef", "Value": "${{ github.ref }}"} + ] + + run-smoke-tests: + needs: + - start-ec2-runner + runs-on: ${{needs.start-ec2-runner.outputs.label}} + # It is important that this job has no write permissions and has + # no access to any secrets. This part is where we are running + # untrusted code from PRs. 
+ permissions: {} + steps: + - name: "Harden runner" + uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1 + with: + egress-policy: audit + + - name: "Install packages" + run: | + cat /etc/os-release + sudo dnf install -y gcc gcc-c++ make git-core python3.11 python3.11-devel + + - name: "Verify cuda environment is setup" + run: | + export CUDA_HOME="/usr/local/cuda" + export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64" + export PATH="${PATH}:${CUDA_HOME}/bin" + nvidia-smi + + - name: "Checkout code" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + ref: ${{inputs.branch}} + + # installs in $GITHUB_WORKSPACE/venv. + # only has to install Tox because Tox will do the other virtual environment management. + - name: "Setup Python virtual environment" + run: | + python3.11 -m venv --upgrade-deps venv + . venv/bin/activate + pip install tox + + - name: "Show disk utilization BEFORE tests" + run: | + df -h + + - name: "Run smoke tests with Tox and Pytest" + run: | + source venv/bin/activate + tox -e py3-smoke + + - name: "Show disk utilization AFTER tests" + run: | + df -h + + stop-ec2-runner: + needs: + - start-ec2-runner + - run-smoke-tests + runs-on: ubuntu-latest + if: ${{ always() }} + steps: + - name: "Harden runner" + uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1 + with: + egress-policy: audit + + - name: "Configure AWS credentials" + uses: "aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502" # v4.0.2 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ vars.AWS_REGION }} + + - name: "Stop EC2 runner" + uses: machulav/ec2-github-runner@28fbe1c4d7d9ba74134ca5ebc559d5b0a989a856 # v2.3.8 + with: + mode: stop + github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + label: ${{ 
needs.start-ec2-runner.outputs.label }} + ec2-instance-id: ${{ needs.start-ec2-runner.outputs.ec2-instance-id }} diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit.yaml similarity index 100% rename from .github/workflows/unit-tests.yaml rename to .github/workflows/unit.yaml diff --git a/tests/test_smoke/test_train.py b/tests/smoke/test_train.py similarity index 65% rename from tests/test_smoke/test_train.py rename to tests/smoke/test_train.py index fdc55293..d7e81cc9 100644 --- a/tests/test_smoke/test_train.py +++ b/tests/smoke/test_train.py @@ -1,4 +1,5 @@ # Standard +from typing import Generator import os import pathlib import shutil @@ -48,15 +49,16 @@ REFERENCE_TEST_MODEL = "instructlab/granite-7b-lab" RUNNER_CPUS_EXPECTED = 4 -# matrix of training environments we'd like to test -DIST_BACKEND_FRAMEWORKS = ["fsdp", "deepspeed"] -USE_DOLOMITE = [True, False] -CPU_OFFLOADING = [True, False] -USE_LORA = [True, False] - @pytest.fixture(scope="module") -def custom_tmp_path(): +def custom_tmp_dir() -> Generator[pathlib.Path, None, None]: + """A custom fixture for a temporary directory. + By default, pytest's builtin `tmp_path` fixture is function-scoped + but we can reuse the same cached storage between many tests. + + Yields: + Generator[pathlib.Path, None, None]: path to root directory of temp storage. + """ temp_dir = tempfile.mkdtemp() temp_path = pathlib.Path(temp_dir) @@ -67,11 +69,15 @@ def custom_tmp_path(): @pytest.fixture(scope="function") -def checkpoint_dir(custom_tmp_path: pathlib.Path) -> pathlib.Path: +def checkpoint_dir( + custom_tmp_dir: pathlib.Path, +) -> Generator[pathlib.Path, None, None]: """ - Creates a 'checkpoints' directory for each test and deletes it afterward. + Creates a 'checkpoints' directory. + This directory must be function-scoped because each test + will create its own checkpoints. 
""" - ckpt_dir = custom_tmp_path / "checkpoints" + ckpt_dir = custom_tmp_dir / "checkpoints" ckpt_dir.mkdir() yield ckpt_dir @@ -80,16 +86,32 @@ def checkpoint_dir(custom_tmp_path: pathlib.Path) -> pathlib.Path: @pytest.fixture(scope="module") -def prepared_data_dir(custom_tmp_path: pathlib.Path) -> pathlib.Path: - data_file_dir = custom_tmp_path / "prepared_data" +def prepared_data_dir(custom_tmp_dir: pathlib.Path) -> pathlib.Path: + """Sets up module-scoped temporary dir for storage of preprocessed data. + + Args: + custom_tmp_dir (pathlib.Path): root dir of temporary storage + + Returns: + pathlib.Path: path to directory where preprocessed data can be cached + """ + data_file_dir = custom_tmp_dir / "prepared_data" data_file_dir.mkdir() return data_file_dir @pytest.fixture(scope="module") -def cached_model_dir(custom_tmp_path: pathlib.Path) -> pathlib.Path: - model_dir = custom_tmp_path / "model" +def cached_model_dir(custom_tmp_dir: pathlib.Path) -> pathlib.Path: + """Sets up module-scoped temporary dir for storage of model checkpoint + + Args: + custom_tmp_dir (pathlib.Path): root dir of temporary storage + + Returns: + pathlib.Path: path to directory where model checkpoint can be cached + """ + model_dir = custom_tmp_dir / "model" model_dir.mkdir() return model_dir @@ -111,7 +133,6 @@ def cached_test_model(cached_model_dir: pathlib.Path) -> pathlib.Path: """ huggingface_hub.snapshot_download( - token=os.getenv("HF_TOKEN", None), repo_id=REFERENCE_TEST_MODEL, local_dir=cached_model_dir, ) @@ -120,21 +141,40 @@ def cached_test_model(cached_model_dir: pathlib.Path) -> pathlib.Path: def this_file_path() -> pathlib.Path: + """returns the fully qualified path to this file.""" return pathlib.Path(__file__).resolve() -def data_in_repo_path() -> pathlib.Path: +def repo_root_dir() -> pathlib.Path: + """returns the fully qualified path to the root of the repo.""" current_file_path = this_file_path() - data_in_repo_path = ( - current_file_path.parents[2] / "sample-data" / 
"train_all_pruned_SDG.jsonl" - ) + return current_file_path.parents[2] + + +def data_in_repo_path() -> pathlib.Path: + """The data that we'll use in these tests is stored in the repo as an artifact. + This returns a path to the `data.jsonl` file based on this file's location + in the repo. + + Returns: + pathlib.Path: Path to a `.jsonl` file for tests + """ + repo_root = repo_root_dir() + data_in_repo_path = repo_root / "sample-data" / "train_all_pruned_SDG.jsonl" return data_in_repo_path def chat_template_in_repo_path() -> pathlib.Path: - current_file_path = this_file_path() + """The chat template that we'll use in these tests is stored in the repo as an artifact. + This returns a path to the `chattemplate.py` file based on this file's location + in the repo. + + Returns: + pathlib.Path: Path to a `chat_template.py" file for tests + """ + repo_root = repo_root_dir() chat_template_path = ( - current_file_path.parents[2] + repo_root / "src" / "instructlab" / "training" @@ -169,42 +209,18 @@ def cached_training_data( return prepared_data_dir / "data.jsonl" -@pytest.mark.skip @pytest.mark.slow -def test_basic_training_run( - cached_test_model: pathlib.Path, - cached_training_data: pathlib.Path, - checkpoint_dir: pathlib.Path, - prepared_data_dir: pathlib.Path, -) -> None: - """ - Used for isolated test development. Skipped when not in use. 
- """ - - train_args = TrainingArgs( - model_path=str(cached_test_model), - data_path=str(cached_training_data), - data_output_dir=str(prepared_data_dir), - ckpt_output_dir=str(checkpoint_dir), - **MINIMAL_TRAINING_ARGS, - ) - - torch_args = TorchrunArgs(**DEFAULT_TORCHRUN_ARGS) - - run_training(torch_args=torch_args, train_args=train_args) - assert True - - -@pytest.mark.slow -@pytest.mark.parametrize("dist_backend", DIST_BACKEND_FRAMEWORKS) -@pytest.mark.parametrize("cpu_offload", CPU_OFFLOADING) +@pytest.mark.parametrize( + "dist_backend", [DistributedBackend.FSDP, DistributedBackend.DEEPSPEED] +) +@pytest.mark.parametrize("cpu_offload", [True, False]) def test_training_feature_matrix( cached_test_model: pathlib.Path, cached_training_data: pathlib.Path, checkpoint_dir: pathlib.Path, prepared_data_dir: pathlib.Path, cpu_offload: bool, - dist_backend: str, + dist_backend: DistributedBackend, ) -> None: train_args = TrainingArgs( model_path=str(cached_test_model), @@ -214,8 +230,9 @@ def test_training_feature_matrix( **MINIMAL_TRAINING_ARGS, ) - train_args.distributed_backend = DistributedBackend(dist_backend) - if DistributedBackend.FSDP.value == dist_backend: + train_args.distributed_backend = dist_backend + + if dist_backend == DistributedBackend.FSDP: train_args.fsdp_options.cpu_offload_params = cpu_offload else: pytest.xfail("DeepSpeed not currently functional. OOMs during backprop.") diff --git a/tests/test_unit/test_init.py b/tests/unit/test_init.py similarity index 100% rename from tests/test_unit/test_init.py rename to tests/unit/test_init.py diff --git a/tox.ini b/tox.ini index f9ee24c4..97b98be0 100644 --- a/tox.ini +++ b/tox.ini @@ -33,8 +33,6 @@ commands = {envpython} -m pytest tests/test_unit {posargs} # `--` delimits flags that are meant for tox vs. those that are positional arguments for # the command that's being run in the environment. -# format, check, and linting targets don't build and install the project to -# speed up testing. 
[testenv:py3-smoke] description = run accelerated smoke tests with pytest passenv = @@ -42,12 +40,12 @@ passenv = deps = pytest pytest-asyncio - pytest-cov - pytest-html -r requirements-dev.txt -r requirements-cuda.txt commands = {envpython} -m pytest tests/test_smoke {posargs} +# format, check, and linting targets don't build and install the project to +# speed up testing. [testenv:lint] description = lint with pylint basepython = {[testenv:py3]basepython}