80 changes: 58 additions & 22 deletions Dockerfile.tmpl
@@ -1,7 +1,52 @@
ARG BASE_IMAGE \
BASE_IMAGE_TAG

FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG}
# =============================================================================
# Stage 1: Apply --force-reinstall operations to the base image
# These replace packages from the base, causing layer bloat. We squash this
# stage to eliminate the duplicate package data.
# =============================================================================
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS base-reinstalls

# Combine all --force-reinstall operations into one layer before squashing.
# b/408281617: Torch insists on cudnn 9.1.x and refuses to install 9.3.x, but TensorFlow only supports 9.3.x.
# This conflict causes a number of package downgrades, which are handled in this command.
# b/394382016: sigstore (a dependency of kagglehub) requires prerelease packages, so it is installed separately.
# b/385145217: Intel MKL numpy removed - Intel's channel only has numpy 1.26.4, but the base image has
# numpy 2.0.2. Downgrading would break packages built against the numpy 2.x ABI.
# b/404590350: Ray and torchtune ship conflicting `tune` CLIs; we prioritize torchtune.
# b/415358158: Gensim removed from Colab image to upgrade scipy
# b/456239669: remove the huggingface-hub pin once pytorch-lightning and transformers are compatible
# b/315753846: Unpin the translate package; it currently conflicts with adk 1.17.0
# b/468379293: Unpin pandas once cuml/cudf are compatible; version 3.0 causes issues
# b/468383498: numpy will auto-upgrade to 2.4.x, which causes issues with numerous packages
# b/468367647: Unpin protobuf, version greater than v5.29.5 causes issues with numerous packages
# b/408298750: We reinstall nltk because older versions have: `AttributeError: module 'inspect' has no attribute 'formatargspec'`
RUN uv pip install --no-cache \
--index-url https://pypi.nvidia.com --extra-index-url https://pypi.org/simple/ --index-strategy unsafe-first-match \
--system --force-reinstall "cuml-cu12==25.2.1" \
"nvidia-cudnn-cu12==9.3.0.75" "nvidia-cublas-cu12==12.5.3.2" "nvidia-cusolver-cu12==11.6.3.83" \
"nvidia-cuda-cupti-cu12==12.5.82" "nvidia-cuda-nvrtc-cu12==12.5.82" "nvidia-cuda-runtime-cu12==12.5.82" \
"nvidia-cufft-cu12==11.2.3.61" "nvidia-curand-cu12==10.3.6.82" "nvidia-cusparse-cu12==12.5.1.3" \
"nvidia-nvjitlink-cu12==12.5.82" \
&& uv pip install --no-cache --system --force-reinstall "pynvjitlink-cu12==0.5.2" \
&& uv pip install --no-cache --system --force-reinstall --prerelease=allow "kagglehub[pandas-datasets,hf-datasets,signing]>=0.3.12" \
&& uv pip install --no-cache --system --force-reinstall --no-deps torchtune gensim "scipy<=1.15.3" "huggingface-hub==0.36.0" "google-cloud-translate==3.12.1" "numpy==2.0.2" "pandas==2.2.2" \
&& uv pip install --no-cache --system --force-reinstall "protobuf==5.29.5" \
&& uv pip install --no-cache --system --force-reinstall "nltk>=3.9.1" \
&& rm -rf /root/.cache/uv /root/.cache/pip

# =============================================================================
# Stage 2: Squash the base + reinstalls to eliminate layer bloat
# =============================================================================
FROM scratch AS clean-base
COPY --from=base-reinstalls / /

# =============================================================================
# Stage 3: Continue with cacheable operations
# These layers will be cached normally on subsequent builds
# =============================================================================
FROM clean-base

ADD kaggle_requirements.txt /kaggle_requirements.txt

@@ -12,32 +57,22 @@ RUN pip freeze | grep -E 'tensorflow|keras|torch|jax' > /colab_requirements.txt
RUN cat /colab_requirements.txt >> /requirements.txt
RUN cat /kaggle_requirements.txt >> /requirements.txt

# Install Kaggle packages
RUN uv pip install --system -r /requirements.txt
# TODO: GPU requirements.txt
# TODO: merge them better (override matching ones).

# Install Kaggle packages (--no-cache prevents cache buildup)
RUN uv pip install --no-cache --system -r /requirements.txt

# Install manual packages:
# b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data.
RUN uv pip uninstall --system google-cloud-bigquery-storage

# b/394382016: sigstore (a dependency of kagglehub) requires prerelease packages, so it is installed separately.
RUN uv pip install --system --force-reinstall --prerelease=allow "kagglehub[pandas-datasets,hf-datasets,signing]>=0.3.12"

# uv cannot install this in requirements.txt without --no-build-isolation
# to avoid affecting the larger build, we'll post-install it.
RUN uv pip install --no-build-isolation --system "git+https://github.com/Kaggle/learntools"
RUN uv pip install --no-cache --no-build-isolation --system "git+https://github.com/Kaggle/learntools"

# newer daal4py requires tbb>=2022, but libpysal is downgrading it for some reason
RUN uv pip install --system "tbb>=2022" "libpysal==4.9.2"

# b/404590350: Ray and torchtune ship conflicting `tune` CLIs; we prioritize torchtune.
# b/415358158: Gensim removed from Colab image to upgrade scipy
# b/456239669: remove the huggingface-hub pin once pytorch-lightning and transformers are compatible
# b/315753846: Unpin the translate package; it currently conflicts with adk 1.17.0
# b/468379293: Unpin pandas once cuml/cudf are compatible; version 3.0 causes issues
# b/468383498: numpy will auto-upgrade to 2.4.x, which causes issues with numerous packages
# b/468367647: Unpin protobuf, version greater than v5.29.5 causes issues with numerous packages
RUN uv pip install --system --force-reinstall --no-deps torchtune gensim "scipy<=1.15.3" "huggingface-hub==0.36.0" "google-cloud-translate==3.12.1" "numpy==2.0.2" "pandas==2.2.2"
RUN uv pip install --system --force-reinstall "protobuf==5.29.5"
RUN uv pip install --no-cache --system "tbb>=2022" "libpysal==4.9.2"

# Adding non-package dependencies:
ADD clean-layer.sh /tmp/clean-layer.sh
@@ -48,7 +83,7 @@ ARG PACKAGE_PATH=/usr/local/lib/python3.12/dist-packages

# Install GPU-specific non-pip packages.
{{ if eq .Accelerator "gpu" }}
RUN uv pip install --system "pycuda"
RUN uv pip install --no-cache --system "pycuda"
{{ end }}


@@ -72,9 +107,7 @@ RUN apt-get install -y libfreetype6-dev && \
apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing

# NLTK Project datasets
# b/408298750: We currently reinstall the package, because we get the following error:
# `AttributeError: module 'inspect' has no attribute 'formatargspec'. Did you mean: 'formatargvalues'?`
RUN uv pip install --system --force-reinstall "nltk>=3.9.1"
# Note: nltk is reinstalled in stage 1 to fix b/408298750 (formatargspec error)
RUN mkdir -p /usr/share/nltk_data && \
# NLTK Downloader no longer continues smoothly after an error, so we explicitly list
# the corpuses that work
@@ -168,6 +201,9 @@ ENV GIT_COMMIT=${GIT_COMMIT} \
# Correlate current release with the git hash inside the kernel editor by running `!cat /etc/git_commit`.
RUN echo "$GIT_COMMIT" > /etc/git_commit && echo "$BUILD_DATE" > /etc/build_date

# Final cleanup
RUN rm -rf /root/.cache/uv /root/.cache/pip /tmp/clean-layer.sh

{{ if eq .Accelerator "gpu" }}
# Add the CUDA home.
ENV CUDA_HOME=/usr/local/cuda
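For readers unfamiliar with the squash pattern above: `COPY --from=base-reinstalls / /` into a `FROM scratch` stage flattens every layer of Stage 1 into a single filesystem snapshot, so the package files replaced by --force-reinstall no longer linger as dead weight in lower layers. The sketch below shows how one might verify the effect locally; the image tag, build-arg values, and the assumption that the template has already been rendered to a plain Dockerfile (the {{ ... }} blocks are handled by the repo's build tooling) are placeholders for illustration, not part of this PR.

# Build the rendered Dockerfile (tag and build args below are hypothetical).
docker build \
  --build-arg BASE_IMAGE=us-docker.pkg.dev/colab-images/public/runtime \
  --build-arg BASE_IMAGE_TAG=latest \
  -t kaggle-python:squash-check .

# Per-layer breakdown: after the squash, the reinstalled CUDA/cuDNN wheels should
# appear once, inside the single "COPY --from=base-reinstalls / /" layer, rather
# than duplicated across the base layers plus a reinstall layer.
docker history --format 'table {{.CreatedBy}}\t{{.Size}}' kaggle-python:squash-check

# Total image size, for a before/after comparison against an unsquashed build.
docker image inspect --format '{{.Size}}' kaggle-python:squash-check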
4 changes: 2 additions & 2 deletions clean-layer.sh
@@ -10,8 +10,8 @@
set -e
set -x

# Delete files that pip caches when installing a package.
rm -rf /root/.cache/pip/*
# Delete files that pip and uv cache when installing packages.
rm -rf /root/.cache/pip/* /root/.cache/uv/*
# Delete old downloaded archive files
apt-get autoremove -y
# Delete downloaded archive files
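A note on the added rm target: uv keeps its own wheel and build cache separately from pip's, so clearing only /root/.cache/pip leaves uv's copy baked into image layers. A minimal sketch, assuming a root shell inside the image, of how one could confirm the cache location and reclaim it with uv's own tooling instead of rm:

# Print the directory uv actually uses for its cache (normally /root/.cache/uv
# when running as root, which is the path clean-layer.sh now deletes).
uv cache dir

# Rough size of both caches, handy when deciding whether a RUN step needs a
# trailing /tmp/clean-layer.sh call at all.
du -sh /root/.cache/pip /root/.cache/uv 2>/dev/null || true

# Equivalent cleanup via uv itself rather than rm -rf.
uv cache clean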
6 changes: 4 additions & 2 deletions kaggle_requirements.txt
@@ -91,7 +91,7 @@ path
path.py
pdf2image
plotly-express
preprocessing
# Removed: preprocessing (unmaintained since 2017; requires nltk==3.2.4, which is incompatible with Python 3.11)
pudb
pyLDAvis
pycryptodome
@@ -109,7 +109,9 @@ qtconsole
ray
rgf-python
s3fs
scikit-learn
# b/302136621: Fix eli5 import for learntools
# Note: scikit-learn 1.2.2 is incompatible with the numpy 2.x ABI; using 1.5.2, which supports numpy 2.x
scikit-learn==1.5.2
# Scikit-learn accelerated library for x86
scikit-learn-intelex>=2023.0.1
scikit-multilearn
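On the scikit-learn pin: extension wheels built only against the numpy 1.x C ABI can fail to import once numpy 2.x is installed, which is why the requirement now names a release with numpy 2.x support instead of floating. A quick smoke test one might run in the finished image (assuming python3 is on PATH; the command is illustrative, not part of the build):

# Both imports must succeed against the numpy pinned in the Dockerfile (2.0.2);
# an ABI-mismatched scikit-learn wheel typically raises ImportError here.
python3 -c "import numpy, sklearn; print('numpy', numpy.__version__, 'scikit-learn', sklearn.__version__)"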