Skip to content

Commit b0cdf76

Browse files
committed
init: GPU Runner repository with training, inference, and deployment tooling
0 parents  commit b0cdf76

43 files changed

Lines changed: 7367 additions & 0 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/ci.yml

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
# =============================================================================
# GitHub Actions CI Workflow
# =============================================================================
# Runs the CPU test suite, a lint pass, and a Docker image build on pushes and
# pull requests that target the branches listed under `on:`.
# GPU tests are not run here because standard GitHub-hosted runners have no GPU.
#
# For GPU testing, use a self-hosted runner with GPU support (see the
# commented-out `gpu-test` job at the bottom of this file) or test manually on
# a GPU-enabled machine.
# =============================================================================

name: CI

on:
  push:
    branches: [main, master, develop]
  pull_request:
    branches: [main, master, develop]

# Least privilege: every job below only needs to read the repository.
permissions:
  contents: read

jobs:
  test:
    runs-on: ubuntu-latest

    strategy:
      # Let every Python version run to completion so one failing version
      # does not cancel the others.
      fail-fast: false
      matrix:
        # Versions are quoted so YAML does not read 3.10 as the float 3.1.
        python-version: ['3.8', '3.9', '3.10', '3.11']

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          # Install CPU-only PyTorch for faster CI
          pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
          pip install -r requirements.txt
          pip install pytest pytest-xdist

      - name: Check Python syntax
        run: |
          python -m py_compile src/train/utils.py
          python -m py_compile src/train/train_single_gpu.py
          python -m py_compile src/train/train_dataparallel.py
          python -m py_compile src/train/train_ddp.py
          python -m py_compile src/inference/inference.py
          python -m py_compile src/examples/simple_dataset.py
          python -m py_compile src/cutile_examples/cutile_vector_add.py
          python -m py_compile src/cutile_examples/cutile_integration_example.py
          python -m py_compile scripts/multi_task_scheduler.py

      - name: Run CPU tests
        run: |
          # The DDP suite is excluded here and run in its own step below.
          pytest tests/ -v --ignore=tests/test_ddp_local.py -x

      - name: Run DDP tests (skipped without GPUs)
        # Non-blocking: the job still passes if this step fails, but the
        # failure is shown on the step instead of being swallowed by the shell.
        continue-on-error: true
        run: |
          # DDP tests are expected to skip themselves via their requires_cuda markers
          pytest tests/test_ddp_local.py -v

      - name: Test imports
        run: |
          python -c "from src.train.utils import set_seed, get_device, check_cutile_available"
          python -c "from src.examples.simple_dataset import SimpleDataset, get_simple_model"
          python -c "import torch; print(f'PyTorch {torch.__version__}')"

  lint:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install linting tools
        run: |
          pip install flake8

      - name: Run flake8 (warnings only)
        run: |
          # Report issues without failing the job (for now). --exit-zero only
          # forgives lint findings; a missing flake8 executable still fails.
          # Pass 1: syntax errors and undefined names, with source context.
          flake8 src/ --count --exit-zero --select=E9,F63,F7,F82 --show-source --statistics
          # Pass 2: full rule set with relaxed complexity / line-length limits.
          flake8 src/ --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics

  docker:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Build Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: docker/Dockerfile
          # Build only; the image is never pushed from CI.
          push: false
          tags: gpu-runner:test
          # Reuse image layers between runs through the GitHub Actions cache.
          cache-from: type=gha
          cache-to: type=gha,mode=max

  # Optional: GPU tests on self-hosted runner
  # Uncomment if you have a self-hosted runner with GPU
  #
  # gpu-test:
  #   runs-on: [self-hosted, gpu]
  #
  #   steps:
  #     - name: Checkout code
  #       uses: actions/checkout@v4
  #
  #     - name: Set up Python
  #       uses: actions/setup-python@v5
  #       with:
  #         python-version: '3.10'
  #
  #     - name: Install dependencies
  #       run: |
  #         pip install -r requirements.txt
  #         pip install pytest
  #
  #     - name: Run GPU tests
  #       run: |
  #         nvidia-smi
  #         pytest tests/ -v
  #
  #     - name: Run DDP smoke test
  #       run: |
  #         torchrun --nproc_per_node=2 src/train/train_ddp.py --epochs 1

.gitignore

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
# (*.log is listed once, under "Logs" below)
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
Pipfile.lock

# PEP 582
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# VS Code
.vscode/

# PyCharm
.idea/

# Checkpoints and model weights
checkpoints/
*.pt
*.pth
*.ckpt
*.safetensors

# Logs
logs/
*.log
tensorboard/
wandb/

# Data
data/
datasets/

# CUDA build artifacts
*.cubin
*.fatbin
*.ptx
*.o

# cuTile artifacts
.cutile_cache/
*.cutile

# Temporary files
tmp/
temp/
*.tmp
*.swp
*.swo
*~

# OS files
.DS_Store
Thumbs.db

# Archives
*.zip
*.tar.gz
*.tar
*.rar

LICENSE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2024 GPU Runner Repository Contributors
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

MANIFEST.in

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
# Source-distribution manifest: lists the non-Python files to ship and the
# file patterns to leave out.

# Top-level project files
include LICENSE README.md requirements.txt pyproject.toml

# Configuration files
recursive-include configs *.yaml *.json

# Scripts: shell, sbatch, and YAML files
recursive-include scripts *.sh *.sbatch *.yaml

# Documentation
recursive-include docs *.md *.rst

# Leave out compiled files, git metadata, and macOS Finder files
global-exclude *.pyc *.pyo __pycache__ *.so .git* .DS_Store

0 commit comments

Comments
 (0)