# Base image coordinates; both are supplied at build time and consumed by FROM.
ARG BASE_IMAGE \
    BASE_IMAGE_TAG

FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG}

ADD kaggle_requirements.txt /kaggle_requirements.txt

# Freeze existing requirements from base image for critical packages:
RUN pip freeze | grep -E 'tensorflow|keras|torch|jax' > /colab_requirements.txt

# Merge requirements files (frozen base-image pins first, then the Kaggle list):
RUN cat /colab_requirements.txt >> /requirements.txt
RUN cat /kaggle_requirements.txt >> /requirements.txt

# TODO: GPU requirements.txt
# TODO: merge them better (override matching ones).

# Install Kaggle packages
RUN uv pip install --system -r /requirements.txt

# Install manual packages:
# b/183041606#comment5: the Kaggle data proxy doesn't support these APIs. If the library is missing, it falls back to using a regular BigQuery query to fetch data.
RUN uv pip uninstall --system google-cloud-bigquery-storage

# b/394382016: sigstore (dependency of kagglehub) requires prerelease packages, so it is installed separately.
# b/408284143: google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1,
# installed outside of kaggle_requirements.txt due to requiring an incompatible version of protobuf.
# NOTE: the requirement specifiers must stay quoted. RUN uses the shell form, so an unquoted
# ">=0.3.12" is parsed as a redirection of stdout to a file named "=0.3.12" (silently dropping
# the version constraint), and the "[...]" extras list is subject to glob expansion.
RUN uv pip install --system --force-reinstall --prerelease=allow \
    "kagglehub[pandas-datasets,hf-datasets,signing]>=0.3.12" \
    "google-cloud-automl==1.0.1"

# uv cannot install this in requirements.txt without --no-build-isolation
# to avoid affecting the larger build, we'll post-install it.
RUN uv pip install --no-build-isolation --system "git+https://github.com/Kaggle/learntools"

# b/408281617: Torch is adamant that it can not install cudnn 9.3.x, only 9.1.x, but Tensorflow can only support 9.3.x.
# This conflict forces a number of package downgrades, which the command below pins explicitly.
# Packages are resolved against the NVIDIA index first, with PyPI as the extra index.
RUN uv pip install \
    --index-url https://pypi.nvidia.com \
    --extra-index-url https://pypi.org/simple/ \
    --index-strategy unsafe-first-match \
    --system \
    --force-reinstall \
    "cuml-cu12==25.2.1" \
    "nvidia-cudnn-cu12==9.3.0.75" \
    "nvidia-cublas-cu12==12.5.3.2" \
    "nvidia-cusolver-cu12==11.6.3.83" \
    "nvidia-cuda-cupti-cu12==12.5.82" \
    "nvidia-cuda-nvrtc-cu12==12.5.82" \
    "nvidia-cuda-runtime-cu12==12.5.82" \
    "nvidia-cufft-cu12==11.2.3.61" \
    "nvidia-curand-cu12==10.3.6.82" \
    "nvidia-cusparse-cu12==12.5.1.3" \
    "nvidia-nvjitlink-cu12==12.5.82"

RUN uv pip install --system --force-reinstall \
    "pynvjitlink-cu12==0.5.2"

# Newer daal4py needs tbb>=2022, yet libpysal downgrades it for some reason.
# b/315753846: Unpin translate package.
RUN uv pip install --system \
    "tbb>=2022" \
    "libpysal==4.9.2" \
    "google-cloud-translate==3.12.1"

# b/385145217: the latest Colab image lacks the MKL build of numpy; install it from the Intel index.
RUN uv pip install --system --force-reinstall \
    --index-url https://software.repos.intel.com/python/pypi \
    "numpy==1.26.4"

# b/404590350: Ray and torchtune both ship a `tune` CLI; torchtune takes priority.
# b/415358158: Gensim was removed from the Colab image to upgrade scipy.
RUN uv pip install --system --force-reinstall --no-deps \
    "torchtune" \
    "gensim" \
    "scipy<=1.15.3"

# Non-package dependencies: layer cleanup script and nbconvert configuration.
ADD clean-layer.sh /tmp/clean-layer.sh
ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl
ADD patches/template_conf.json /opt/kaggle/conf.json

# /opt/conda/lib/python3.12/site-packages
ARG PACKAGE_PATH=/usr/local/lib/python3.12/dist-packages

# Install GPU-specific non-pip packages.
{{ if eq .Accelerator "gpu" }}
RUN uv pip install --system "pycuda"
{{ end }}

# Use a fixed apt-get repo to stop intermittent failures due to flaky httpredir connections,
# as described by Lionel Chan at http://stackoverflow.com/a/37426929/5881346
RUN sed -i "s/httpredir.debian.org/debian.uchicago.edu/" /etc/apt/sources.list && \
    apt-get update --allow-releaseinfo-change && \
    # Needed by lightGBM (GPU build)
    # https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html#build-lightgbm
    apt-get install -y build-essential unzip cmake libboost-dev libboost-system-dev libboost-filesystem-dev p7zip-full && \
    # b/182601974: ssh client was removed from the base image but is required for packages such as stable-baselines.
    apt-get install -y openssh-client && \
    apt-get install -y graphviz && pip install graphviz && \
    /tmp/clean-layer.sh

# Patched keras helpers for tensorflow_decision_forests.
ADD patches/keras_internal.py \
    patches/keras_internal_test.py \
    $PACKAGE_PATH/tensorflow_decision_forests/keras/

RUN apt-get install -y libfreetype6-dev && \
    apt-get install -y libglib2.0-0 libxext6 libsm6 libxrender1 libfontconfig1 --fix-missing

RUN apt-get install -y git-lfs && \
    # vtk dependencies
    apt-get install -y libgl1-mesa-glx && \
    # xvfbwrapper dependencies
    apt-get install -y xvfb && \
    /tmp/clean-layer.sh

RUN uv pip install --system --force-reinstall "nltk==3.9.1"
RUN mkdir -p /usr/share/nltk_data && \
    # NLTK Downloader no longer continues smoothly after an error, so we explicitly list
    # the corpuses that work
    python -m nltk.downloader -d /usr/share/nltk_data abc alpino averaged_perceptron_tagger \
    basque_grammars biocreative_ppi bllip_wsj_no_aux \
    book_grammars brown brown_tei cess_cat cess_esp chat80 city_database cmudict \
    comtrans conll2000 conll2002 conll2007 crubadan dependency_treebank \
    europarl_raw floresta gazetteers genesis gutenberg \
    ieer inaugural indian jeita kimmo knbc large_grammars lin_thesaurus mac_morpho machado \
    masc_tagged maxent_ne_chunker maxent_treebank_pos_tagger moses_sample movie_reviews \
    mte_teip5 names nps_chat omw opinion_lexicon paradigms \
    pil pl196x porter_test ppattach problem_reports product_reviews_1 product_reviews_2 propbank \
    pros_cons ptb punkt punkt_tab qc reuters rslp rte sample_grammars semcor senseval sentence_polarity \
    sentiwordnet shakespeare sinica_treebank smultron snowball_data spanish_grammars \
    state_union stopwords subjectivity swadesh switchboard tagsets timit toolbox treebank \
    twitter_samples udhr2 udhr unicode_samples universal_tagset universal_treebanks_v20 \
    vader_lexicon verbnet webtext word2vec_sample wordnet wordnet_ic words ycoe

# Download base easyocr models.
# https://github.com/JaidedAI/EasyOCR#usage
# Each archive is unpacked into the model directory and the zip is removed afterwards.
RUN mkdir -p /root/.EasyOCR/model && \
    wget --no-verbose "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/latin_g2.zip" -O /root/.EasyOCR/model/latin.zip && \
    unzip /root/.EasyOCR/model/latin.zip -d /root/.EasyOCR/model/ && \
    rm /root/.EasyOCR/model/latin.zip && \
    wget --no-verbose "https://github.com/JaidedAI/EasyOCR/releases/download/v1.3/english_g2.zip" -O /root/.EasyOCR/model/english.zip && \
    unzip /root/.EasyOCR/model/english.zip -d /root/.EasyOCR/model/ && \
    rm /root/.EasyOCR/model/english.zip && \
    wget --no-verbose "https://github.com/JaidedAI/EasyOCR/releases/download/pre-v1.1.6/craft_mlt_25k.zip" -O /root/.EasyOCR/model/craft_mlt_25k.zip && \
    unzip /root/.EasyOCR/model/craft_mlt_25k.zip -d /root/.EasyOCR/model/ && \
    rm /root/.EasyOCR/model/craft_mlt_25k.zip && \
    /tmp/clean-layer.sh

# Tesseract and some associated utility packages
RUN apt-get install tesseract-ocr -y

ENV TESSERACT_PATH=/usr/bin/tesseract \
    # For Facets, we also include an empty path to include $PWD.
    PYTHONPATH=:$PYTHONPATH:/opt/facets/facets_overview/python/ \
    # For Theano with MKL
    MKL_THREADING_LAYER=GNU

# Temporary fixes and patches
# Stop jupyter nbconvert trying to rewrite its folder hierarchy
RUN mkdir -p /root/.jupyter && touch /root/.jupyter/jupyter_nbconvert_config.py && touch /root/.jupyter/migrated && \
    mkdir -p /.jupyter && touch /.jupyter/jupyter_nbconvert_config.py && touch /.jupyter/migrated && \
    # Make matplotlib output in Jupyter notebooks display correctly
    mkdir -p /etc/ipython/ && echo "c = get_config(); c.IPKernelApp.matplotlib = 'inline'" > /etc/ipython/ipython_config.py && \
    /tmp/clean-layer.sh

# Fix to import bq_helper library without downgrading setuptools and upgrading protobuf.
# The single-module checkout is rearranged into a `bq_helper` package and setup.py is
# rewritten to declare that package before the editable install.
RUN mkdir -p ~/src && git clone https://github.com/SohierDane/BigQuery_Helper ~/src/BigQuery_Helper && \
    mkdir -p ~/src/BigQuery_Helper/bq_helper && \
    mv ~/src/BigQuery_Helper/bq_helper.py ~/src/BigQuery_Helper/bq_helper/__init__.py && \
    mv ~/src/BigQuery_Helper/test_helper.py ~/src/BigQuery_Helper/bq_helper/ && \
    sed -i 's/)/packages=["bq_helper"])/g' ~/src/BigQuery_Helper/setup.py && \
    uv pip install --system -e ~/src/BigQuery_Helper "protobuf<3.21" && \
    /tmp/clean-layer.sh

# install imagemagick for wand
# https://docs.wand-py.org/en/latest/guide/install.html#install-imagemagick-on-debian-ubuntu
# -y is required: the build is non-interactive, so apt-get aborts if it has to ask for confirmation.
RUN apt-get install -y libmagickwand-dev

# Override default imagemagick policies
ADD patches/imagemagick-policy.xml /etc/ImageMagick-6/policy.xml

# Add Kaggle module resolver: ship the resolver module, then patch tensorflow_hub/config.py
# to import it and register it right after the default resolvers are installed.
ADD patches/kaggle_module_resolver.py $PACKAGE_PATH/tensorflow_hub/kaggle_module_resolver.py
RUN sed -i '/from tensorflow_hub import uncompressed_module_resolver/a from tensorflow_hub import kaggle_module_resolver' "$PACKAGE_PATH/tensorflow_hub/config.py" && \
    sed -i '/_install_default_resolvers()/a \ \ registry.resolver.add_implementation(kaggle_module_resolver.KaggleFileResolver())' "$PACKAGE_PATH/tensorflow_hub/config.py"

# Add BigQuery client proxy settings
ENV PYTHONUSERBASE="/root/.local"
ADD patches/kaggle_gcp.py \
    patches/kaggle_secrets.py \
    patches/kaggle_session.py \
    patches/kaggle_web_client.py \
    patches/kaggle_datasets.py \
    patches/log.py \
    $PACKAGE_PATH/

# Figure out why this is in a different place?
# Found by doing a export PYTHONVERBOSE=1 and then running python and checking for where it looked for it.
ADD patches/sitecustomize.py /usr/lib/python3.12/sitecustomize.py

# Build provenance: exposed as image labels, environment variables and files under /etc.
ARG GIT_COMMIT=unknown \
    BUILD_DATE=unknown

LABEL git-commit=$GIT_COMMIT \
      build-date=$BUILD_DATE

ENV GIT_COMMIT=${GIT_COMMIT} \
    BUILD_DATE=${BUILD_DATE}

# Correlate current release with the git hash inside the kernel editor by running `!cat /etc/git_commit`.
RUN echo "$GIT_COMMIT" > /etc/git_commit && echo "$BUILD_DATE" > /etc/build_date

{{ if eq .Accelerator "gpu" }}
# Add the CUDA home.
ENV CUDA_HOME=/usr/local/cuda
{{ end }}

ENTRYPOINT ["/usr/bin/env"]