diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml
index 5ff9de55..ef4d9afb 100644
--- a/.github/workflows/e2e-nvidia-l4-x1.yml
+++ b/.github/workflows/e2e-nvidia-l4-x1.yml
@@ -15,10 +15,11 @@ on:
       - release-*
     paths:
       # note this should match the merging criteria in 'mergify.yml'
-      - '**.py'
-      - 'pyproject.toml'
-      - 'requirements**.txt'
-      - '.github/workflows/e2e-nvidia-l4-x1.yml' # This workflow
+      - "**.py"
+      - "pyproject.toml"
+      - "requirements**.txt"
+      - ".github/workflows/e2e-nvidia-l4-x1.yml" # This workflow
+      - "!tests/**" # we don't need to run e2e if we're just changing the tests.
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -72,7 +73,7 @@ jobs:
               {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
               {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
             ]
-  
+
   e2e-medium-test:
     needs:
       - start-medium-ec2-runner
@@ -153,7 +154,7 @@ jobs:
           . venv/bin/activate
           # set preserve to true so we can retain the logs
           ./scripts/e2e-ci.sh -mp
-      
+
       # HACK(osilkin): The above test runs the medium workflow test which does not actually test the training library.
       #                Therefore we must disable the upload of the training logs, as they will not exist in the same location.
       # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
@@ -200,7 +201,7 @@ jobs:
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
           label: ${{ needs.start-medium-ec2-runner.outputs.label }}
           ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}
-  
+
       # - name: Download loss data
       #   id: download-logs
       #   uses: actions/download-artifact@v4
@@ -211,12 +212,12 @@ jobs:
       # - name: Install dependencies
       #   run: |
       #     pip install -r requirements-dev.txt
-      
+
       # - name: Try to upload to s3
       #   id: upload-s3
       #   continue-on-error: true
       #   run: |
-      #     output_file='./test.md' 
+      #     output_file='./test.md'
       #     python scripts/create-loss-graph.py \
       #       --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
       #       --output-file "${output_file}" \
diff --git a/.github/workflows/smoke.yaml b/.github/workflows/smoke.yaml
new file mode 100644
index 00000000..a8d39210
--- /dev/null
+++ b/.github/workflows/smoke.yaml
@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: Apache-2.0
+
+name: "Run smoke tests via Tox::pytest"
+# These tests will be long running and require accelerated hardware.
+
+on:
+  workflow_dispatch:
+    inputs:
+      branch:
+        type: string
+        default: main
+  # using this rather than pull_request because this workflow
+  # needs to run in the context of the base branch (main) and
+  # access the repo's secrets to start the AWS instances.
+  pull_request_target:
+    branches:
+      - main
+      - release-*
+
+permissions:
+  contents: read
+
+defaults:
+  run:
+    shell: bash
+
+env:
+  ec2_runner_variant: "g6e.12xlarge" # 4x L40s
+
+jobs:
+  start-ec2-runner:
+    runs-on: ubuntu-latest
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id}}
+
+    steps:
+      - name: "Harden runner"
+        uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1
+        with:
+          egress-policy: audit
+
+      - name: "Configure AWS credentials"
+        uses: "aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502" # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ vars.AWS_REGION }}
+
+      - name: "Start EC2 runner"
+        id: start-ec2-runner
+        uses: machulav/ec2-github-runner@28fbe1c4d7d9ba74134ca5ebc559d5b0a989a856 # v2.3.8
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          ec2-image-id: ${{ vars.AWS_EC2_AMI }}
+          ec2-instance-type: ${{ env.ec2_runner_variant }}
+          subnet-id: subnet-024298cefa3bedd61
+          security-group-id: sg-06300447c4a5fbef3
+          iam-role-name: instructlab-ci-runner
+          aws-resource-tags: >
+            [
+              {"Key": "Name", "Value": "instructlab-ci-github-smoketest-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
+              {"Key": "GitHubRef", "Value": "${{ github.ref }}"}
+            ]
+
+  run-smoke-tests:
+    needs:
+      - start-ec2-runner
+    runs-on: ${{needs.start-ec2-runner.outputs.label}}
+    # It is important that this job has no write permissions and has
+    # no access to any secrets. This part is where we are running
+    # untrusted code from PRs.
+    permissions: {}
+    steps:
+      - name: "Harden runner"
+        uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1
+        with:
+          egress-policy: audit
+
+      - name: "Install packages"
+        run: |
+          cat /etc/os-release
+          sudo dnf install -y gcc gcc-c++ make git-core python3.11 python3.11-devel
+
+      - name: "Verify cuda environment is setup"
+        run: |
+          export CUDA_HOME="/usr/local/cuda"
+          export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64"
+          export PATH="${PATH}:${CUDA_HOME}/bin"
+          nvidia-smi
+
+      - name: "Checkout code"
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+          ref: ${{inputs.branch}}
+
+      # installs in $GITHUB_WORKSPACE/venv.
+      # only has to install Tox because Tox will do the other virtual environment management.
+      - name: "Setup Python virtual environment"
+        run: |
+          python3.11 -m venv --upgrade-deps venv
+          . venv/bin/activate
+          pip install tox
+
+      - name: "Show disk utilization BEFORE tests"
+        run: |
+          df -h
+
+      - name: "Run smoke tests with Tox and Pytest"
+        run: |
+          source venv/bin/activate
+          tox -e py3-smoke
+
+      - name: "Show disk utilization AFTER tests"
+        run: |
+          df -h
+
+  stop-ec2-runner:
+    needs:
+      - start-ec2-runner
+      - run-smoke-tests
+    runs-on: ubuntu-latest
+    if: ${{ always() }}
+    steps:
+      - name: "Harden runner"
+        uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1
+        with:
+          egress-policy: audit
+
+      - name: "Configure AWS credentials"
+        uses: "aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502" # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ vars.AWS_REGION }}
+
+      - name: "Stop EC2 runner"
+        uses: machulav/ec2-github-runner@28fbe1c4d7d9ba74134ca5ebc559d5b0a989a856 # v2.3.8
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-ec2-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-ec2-runner.outputs.ec2-instance-id }}
diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit.yaml
similarity index 100%
rename from .github/workflows/unit-tests.yaml
rename to .github/workflows/unit.yaml
diff --git a/pyproject.toml b/pyproject.toml
index ca053385..f56cfc64 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -107,3 +107,8 @@ exclude = [
 ]
 # honor excludes by not following there through imports
 follow_imports = "silent"
+
+[tool.pytest.ini_options]
+markers = [
+    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+]
diff --git a/requirements-dev.txt b/requirements-dev.txt
index f77c807f..fcb76fbb 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -13,3 +13,4 @@ ipython
 ipykernel
 jupyter
 
+huggingface_hub
diff --git a/tests/smoke/test_train.py b/tests/smoke/test_train.py
new file mode 100644
index 00000000..d7e81cc9
--- /dev/null
+++ b/tests/smoke/test_train.py
@@ -0,0 +1,245 @@
+# Standard
+from typing import Generator
+import os
+import pathlib
+import shutil
+import sys
+import tempfile
+
+# Third Party
+from transformers import AutoModelForCausalLM
+import huggingface_hub
+import pytest
+
+# First Party
+from instructlab.training import data_process
+from instructlab.training.config import (
+    DataProcessArgs,
+    DistributedBackend,
+    TorchrunArgs,
+    TrainingArgs,
+)
+from instructlab.training.main_ds import run_training
+
+MINIMAL_TRAINING_ARGS = {
+    "max_seq_len": 140,  # this config fits nicely on 4xL40s and may need modification for other setups
+    "max_batch_len": 15000,
+    "num_epochs": 1,
+    "effective_batch_size": 3840,
+    "save_samples": 0,
+    "learning_rate": 1e-4,
+    "warmup_steps": 1,
+    "random_seed": 43,
+    "use_dolomite": False,
+    "is_padding_free": False,
+    "checkpoint_at_epoch": True,
+    "accelerate_full_state_at_epoch": True,
+    "process_data": False,  # expect that incoming data has already been prepared and cached.
+    "disable_flash_attn": False,
+}
+
+DEFAULT_TORCHRUN_ARGS = {
+    "nproc_per_node": 4,  # TODO: this is runner-specific. Should parametrize from environment.
+    "nnodes": 1,
+    "node_rank": 0,
+    "rdzv_id": 123,
+    "rdzv_endpoint": "127.0.0.1:12345",
+}
+
+REFERENCE_TEST_MODEL = "instructlab/granite-7b-lab"
+RUNNER_CPUS_EXPECTED = 4
+
+
+@pytest.fixture(scope="module")
+def custom_tmp_dir() -> Generator[pathlib.Path, None, None]:
+    """A custom fixture for a temporary directory.
+    By default, the builtin `tmp_path` fixture is function-scoped
+    but we can reuse the same cached storage between many tests.
+
+    Yields:
+        Generator[pathlib.Path, None, None]: path to root directory of temp storage.
+    """
+    temp_dir = tempfile.mkdtemp()
+
+    temp_path = pathlib.Path(temp_dir)
+
+    yield temp_path
+
+    shutil.rmtree(temp_path)
+
+
+@pytest.fixture(scope="function")
+def checkpoint_dir(
+    custom_tmp_dir: pathlib.Path,
+) -> Generator[pathlib.Path, None, None]:
+    """
+    Creates a 'checkpoints' directory.
+    This directory must be function-scoped because each test
+    will create its own checkpoints.
+    """
+    ckpt_dir = custom_tmp_dir / "checkpoints"
+    ckpt_dir.mkdir()
+
+    yield ckpt_dir
+
+    shutil.rmtree(ckpt_dir)
+
+
+@pytest.fixture(scope="module")
+def prepared_data_dir(custom_tmp_dir: pathlib.Path) -> pathlib.Path:
+    """Sets up module-scoped temporary dir for storage of preprocessed data.
+
+    Args:
+        custom_tmp_dir (pathlib.Path): root dir of temporary storage
+
+    Returns:
+        pathlib.Path: path to directory where preprocessed data can be cached
+    """
+    data_file_dir = custom_tmp_dir / "prepared_data"
+    data_file_dir.mkdir()
+
+    return data_file_dir
+
+
+@pytest.fixture(scope="module")
+def cached_model_dir(custom_tmp_dir: pathlib.Path) -> pathlib.Path:
+    """Sets up module-scoped temporary dir for storage of model checkpoint
+
+    Args:
+        custom_tmp_dir (pathlib.Path): root dir of temporary storage
+
+    Returns:
+        pathlib.Path: path to directory where model checkpoint can be cached
+    """
+    model_dir = custom_tmp_dir / "model"
+    model_dir.mkdir()
+    return model_dir
+
+
+@pytest.fixture(scope="module")
+def cached_test_model(cached_model_dir: pathlib.Path) -> pathlib.Path:
+    """
+    Downloads test model artifacts to temporary cache from HF repo.
+    Assumes that the artifacts for the tokenizer are in the same repo.
+
+    Some interesting behavior:
+    (1) if model is already cached in $HF_HOME/hub/ the parameter blobs
+        will be copied into the specified `local_dir`. If some remote
+        files (like paper.pdf or tokenizer.config) aren't in the HF_HOME
+        cache, they'll be pulled and stored in the `local_dir` cache.
+    (2) if model is NOT already cached in $HF_HOME/hub/, a reference will
+        still be created to it but the downloaded artifacts will not be copied
+        back to the HF_HOME cache from the `local_dir`.
+    """
+
+    huggingface_hub.snapshot_download(
+        repo_id=REFERENCE_TEST_MODEL,
+        local_dir=cached_model_dir,
+    )
+
+    return cached_model_dir
+
+
+def this_file_path() -> pathlib.Path:
+    """returns the fully qualified path to this file."""
+    return pathlib.Path(__file__).resolve()
+
+
+def repo_root_dir() -> pathlib.Path:
+    """returns the fully qualified path to the root of the repo."""
+    current_file_path = this_file_path()
+    return current_file_path.parents[2]
+
+
+def data_in_repo_path() -> pathlib.Path:
+    """The data that we'll use in these tests is stored in the repo as an artifact.
+    This returns a path to the `data.jsonl` file based on this file's location
+    in the repo.
+
+    Returns:
+        pathlib.Path: Path to a `.jsonl` file for tests
+    """
+    repo_root = repo_root_dir()
+    data_in_repo_path = repo_root / "sample-data" / "train_all_pruned_SDG.jsonl"
+    return data_in_repo_path
+
+
+def chat_template_in_repo_path() -> pathlib.Path:
+    """The chat template that we'll use in these tests is stored in the repo as an artifact.
+    This returns a path to the `chat_template.py` file based on this file's location
+    in the repo.
+
+    Returns:
+        pathlib.Path: Path to a `chat_template.py` file for tests
+    """
+    repo_root = repo_root_dir()
+    chat_template_path = (
+        repo_root
+        / "src"
+        / "instructlab"
+        / "training"
+        / "chat_templates"
+        / "ibm_generic_tmpl.py"
+    )
+    return chat_template_path
+
+
+# TODO: This uses our data preprocessing utility which is not, itself, well tested.
+# need to write tests for this as well.
+@pytest.fixture(scope="module")
+def cached_training_data(
+    prepared_data_dir: pathlib.Path, cached_test_model: pathlib.Path
+) -> pathlib.Path:
+    """Renders test data in model template, tokenizes, and saves to fs"""
+
+    data_in_repo = data_in_repo_path()
+    chat_template = chat_template_in_repo_path()
+
+    data_process_args = DataProcessArgs(
+        data_output_path=str(prepared_data_dir),
+        data_path=str(data_in_repo),
+        max_seq_len=MINIMAL_TRAINING_ARGS["max_seq_len"],
+        model_path=str(cached_test_model),
+        chat_tmpl_path=str(chat_template),
+        num_cpu_procs=RUNNER_CPUS_EXPECTED,
+    )
+
+    data_process.main(data_process_args)
+
+    return prepared_data_dir / "data.jsonl"
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize(
+    "dist_backend", [DistributedBackend.FSDP, DistributedBackend.DEEPSPEED]
+)
+@pytest.mark.parametrize("cpu_offload", [True, False])
+def test_training_feature_matrix(
+    cached_test_model: pathlib.Path,
+    cached_training_data: pathlib.Path,
+    checkpoint_dir: pathlib.Path,
+    prepared_data_dir: pathlib.Path,
+    cpu_offload: bool,
+    dist_backend: DistributedBackend,
+) -> None:
+    train_args = TrainingArgs(
+        model_path=str(cached_test_model),
+        data_path=str(cached_training_data),
+        data_output_dir=str(prepared_data_dir),
+        ckpt_output_dir=str(checkpoint_dir),
+        **MINIMAL_TRAINING_ARGS,
+    )
+
+    train_args.distributed_backend = dist_backend
+
+    if dist_backend == DistributedBackend.FSDP:
+        train_args.fsdp_options.cpu_offload_params = cpu_offload
+    else:
+        pytest.xfail("DeepSpeed not currently functional. OOMs during backprop.")
+        if cpu_offload:
+            pytest.xfail("DeepSpeed CPU Adam isn't currently building correctly")
+        train_args.deepspeed_options.cpu_offload_optimizer = cpu_offload
+
+    torch_args = TorchrunArgs(**DEFAULT_TORCHRUN_ARGS)
+
+    run_training(torch_args=torch_args, train_args=train_args)
diff --git a/tests/test_init.py b/tests/unit/test_init.py
similarity index 77%
rename from tests/test_init.py
rename to tests/unit/test_init.py
index b361b9ea..3212c37e 100644
--- a/tests/test_init.py
+++ b/tests/unit/test_init.py
@@ -2,6 +2,5 @@
 import pytest
 
 
-@pytest.mark.fast
 def test_fake():
     assert True
diff --git a/tox.ini b/tox.ini
index 86dca1ce..97b98be0 100644
--- a/tox.ini
+++ b/tox.ini
@@ -19,12 +19,31 @@ basepython = python3.11
 
 [testenv:py3-unit]
 description = run unit tests with pytest
-commands = {envpython} -m pytest tests {posargs}
+passenv =
+    HF_HOME
+deps =
+    pytest
+    pytest-asyncio
+    pytest-cov
+    pytest-html
+    -r requirements-dev.txt
+commands = {envpython} -m pytest tests/unit {posargs}
 # NOTE: {posargs} is a placeholder for input positional arguments
 # such as `tox -e py3-unit -- --pdb` if we wanted to run pytest with pdb enabled.
 # `--` delimits flags that are meant for tox vs. those that are positional arguments for
 # the command that's being run in the environment.
 
+[testenv:py3-smoke]
+description = run accelerated smoke tests with pytest
+passenv =
+    HF_HOME
+deps =
+    pytest
+    pytest-asyncio
+    -r requirements-dev.txt
+    -r requirements-cuda.txt
+commands = {envpython} -m pytest tests/smoke {posargs}
+
 # format, check, and linting targets don't build and install the project to
 # speed up testing.
 [testenv:lint]