
Peteish13 #739

Open: wants to merge 303 commits into base: main

Changes from all commits (303 commits):
4824c8b
Running on a quarter of the nodes, not half :-(
dirkgr Oct 3, 2024
46f907f
Can't go this fast
dirkgr Oct 3, 2024
35ae040
rewrite runs
soldni Oct 4, 2024
161f59a
Add a config for medium lr
dirkgr Oct 4, 2024
99aff8e
New metrics
dirkgr Oct 4, 2024
89ef109
Adds the "dynamic" option to our `torch.compile()` support.
dirkgr Oct 4, 2024
acf8975
Stop at a multiple of 1000 steps while we have a perf issue after eval
dirkgr Oct 4, 2024
0656ce5
New metrics
dirkgr Oct 4, 2024
b699753
compile.dynamic=false
dirkgr Oct 4, 2024
ca8c485
Don't stop
dirkgr Oct 4, 2024
5f1a369
No more torch version pin
dirkgr Oct 5, 2024
d6cdc0c
Run scripts to run on Augusta
dirkgr Oct 5, 2024
57cc09b
This runs out of memory on eval.
dirkgr Oct 5, 2024
88a06bc
Reproduce the eval we're now missing.
dirkgr Oct 5, 2024
0f6b896
Run 1000 steps
dirkgr Oct 5, 2024
ce11f9f
Let's see if dynamic=true solves the problem.
dirkgr Oct 5, 2024
c1d3ffe
It did not.
dirkgr Oct 5, 2024
d3d39a0
Set device when initializing the process group
dirkgr Oct 5, 2024
d8f2aac
Proper way of setting a device_id
dirkgr Oct 5, 2024
2c0d11a
Maybe this incantation
dirkgr Oct 5, 2024
6cc6f62
Don't eval with a compiled model
dirkgr Oct 5, 2024
43184f3
Medium LR on Weka
dirkgr Oct 5, 2024
435c3e6
Revert "Don't eval with a compiled model"
dirkgr Oct 5, 2024
a74ab6e
Let's try this instead.
dirkgr Oct 5, 2024
08718b9
Updated metrics
dirkgr Oct 5, 2024
b093945
This might be faster yet.
dirkgr Oct 5, 2024
51d0a39
Makes the launch_train.sh script work
dirkgr Oct 5, 2024
7cf11d8
Script to run something on all nodes
dirkgr Oct 5, 2024
212f55f
Back to regular compile
dirkgr Oct 5, 2024
68b022f
new config
soldni Oct 6, 2024
2594a2a
Show all errors when something fails
dirkgr Oct 6, 2024
4888fab
Peteish13 config for running in Google
dirkgr Oct 6, 2024
59f2247
Specify the number of nodes on the command line
dirkgr Oct 6, 2024
746d394
Specify the number of nodes on the command line, part 2
dirkgr Oct 6, 2024
ea5ca3f
Correct new path of the venv
dirkgr Oct 6, 2024
b52f228
Silence warning
dirkgr Oct 6, 2024
def6987
More informative logs
dirkgr Oct 6, 2024
0b70025
Set the number of nodes correctly
dirkgr Oct 6, 2024
23f743c
Disable files to do a speed test
dirkgr Oct 6, 2024
a9c7daa
Set gcloud project in environment variable
dirkgr Oct 6, 2024
a9bb215
New metrics
dirkgr Oct 6, 2024
174da42
Have to hold on to the GCS client so we don't overwhelm the GCS metad…
dirkgr Oct 6, 2024
1678ef6
Run with NCCL debugging
dirkgr Oct 7, 2024
fce511b
Merge remote-tracking branch 'origin/peteish13' into peteish13-augusta
dirkgr Oct 7, 2024
129426a
Too much NCCL noise
dirkgr Oct 7, 2024
9858048
Merge remote-tracking branch 'origin/peteish13-augusta' into peteish1…
dirkgr Oct 7, 2024
e2a6840
All files are there now.
dirkgr Oct 7, 2024
ca0abef
New metrics
dirkgr Oct 7, 2024
2f9a726
Launch script for a big peteish13-medlr on Augusta
dirkgr Oct 7, 2024
7e8ba3f
This will have to load from the remote save folder. No local storage …
dirkgr Oct 7, 2024
695f53d
Kill other processes before we start
dirkgr Oct 7, 2024
aba734b
Show the output of run_all_nodes properly.
dirkgr Oct 7, 2024
7a76b03
Comment out DCLM to see if this is what kills the startup
dirkgr Oct 7, 2024
2f94412
Revert "Comment out DCLM to see if this is what kills the startup"
dirkgr Oct 7, 2024
1b020e6
added code back
soldni Oct 7, 2024
a424b25
Make the first host configurable
dirkgr Oct 8, 2024
2f3bddf
Scripts for med and high lr
dirkgr Oct 8, 2024
e66bce7
Send the WandB API key
dirkgr Oct 8, 2024
c40635c
mask
soldni Oct 8, 2024
054dc2d
Enable compilation
dirkgr Oct 9, 2024
71284d1
Peteish7 for Google
dirkgr Oct 9, 2024
5871379
New metrics
dirkgr Oct 9, 2024
05ead0e
Make hostfiles an argument to the scripts
dirkgr Oct 9, 2024
e42206c
Launch config for peteish7-highlr
dirkgr Oct 9, 2024
c151681
Load the correct step0
dirkgr Oct 9, 2024
eeba9cc
Don;t bind to CPUs
dirkgr Oct 9, 2024
48f7b78
More nodes
dirkgr Oct 9, 2024
3f7bd8f
Peteish7 with medium LR
dirkgr Oct 11, 2024
2316171
Don't bind to CPUs
dirkgr Oct 11, 2024
10c468b
Make the rendezvous work when the hostfile is jumbled
dirkgr Oct 11, 2024
29b9b9c
Enable NCCL debug
dirkgr Oct 11, 2024
cbe9781
Use GS more
dirkgr Oct 11, 2024
e7c212b
Shuffle nodes
dirkgr Oct 11, 2024
892d224
Eval every 500 steps
dirkgr Oct 11, 2024
5315e68
Eval every 500 steps
dirkgr Oct 11, 2024
f5badaf
New metrics
dirkgr Oct 11, 2024
ce06d18
Launch script to continue the low LR 13B
dirkgr Oct 11, 2024
c2abab4
Make the launch script work even when some nodes are commented out in…
dirkgr Oct 12, 2024
e717f44
Adds all_reduce_bench from Stas
dirkgr Oct 15, 2024
935d695
Fix all_reduce_bench
dirkgr Oct 15, 2024
bc4f6b9
Metrics interval makes a big difference
dirkgr Oct 15, 2024
01669d8
Settings for more fast peteish13
dirkgr Oct 15, 2024
81a2fa9
Several jobs have made progress
dirkgr Oct 15, 2024
f91cebd
Script that checks node pairs for speed
dirkgr Oct 15, 2024
6f068fa
New way of launching jobs for the peteish13 config
dirkgr Oct 15, 2024
78f9c6f
Experimental script to launch jobs without torchrun
dirkgr Oct 15, 2024
b329bab
Switch to the MPI version
dirkgr Oct 15, 2024
b15bcc4
Metrics
dirkgr Oct 15, 2024
aff2124
Commit to hostpatterns
dirkgr Oct 15, 2024
09f447c
run_all_nodes does not need hostpatterns, since it runs on all nodes
dirkgr Oct 15, 2024
e4a103d
Forgot some more settings
dirkgr Oct 15, 2024
c0a130c
Put NCCL_DEBUG somewhere else
dirkgr Oct 15, 2024
21c8fa3
Set the first host properly
dirkgr Oct 15, 2024
f8ef856
Reshuffle for better logs
dirkgr Oct 15, 2024
35300c0
Maybe high ports don't work
dirkgr Oct 15, 2024
c130118
Faster timeouts for debugging
dirkgr Oct 15, 2024
7f12d39
First is first now
dirkgr Oct 15, 2024
a3db348
Merge branch 'peteish13-augusta' of https://github.com/allenai/LLM in…
dirkgr Oct 15, 2024
3ea5a35
Revert "Faster timeouts for debugging"
dirkgr Oct 15, 2024
88ec63d
Make the launcher a little more flexible
dirkgr Oct 15, 2024
cfb7f4c
new flan cleaned
soldni Oct 16, 2024
19b5ad8
name-fix
soldni Oct 16, 2024
8ff9ebd
More retries for GCS
dirkgr Oct 16, 2024
b79afee
Metrics
dirkgr Oct 16, 2024
ee08b92
Turns out Google APIs don't work that way.
dirkgr Oct 17, 2024
0b3329e
Makes finding the latest checkpoint work on GCS
dirkgr Oct 17, 2024
0263c7a
Remove unused code
dirkgr Oct 17, 2024
3e3e2e3
Fix imports
dirkgr Oct 17, 2024
d24a198
Make the code less readable
dirkgr Oct 17, 2024
82aa577
Make mypy happy
dirkgr Oct 17, 2024
32b869b
more decon
soldni Oct 17, 2024
58cd108
New metrics :-(
dirkgr Oct 19, 2024
5ed69ed
decon-hard-train
soldni Oct 21, 2024
db18672
More metrics
dirkgr Oct 21, 2024
fdc998c
Merge commit '68b022f0bf081891704777f2d48bf836b0934d72' into peteish1…
dirkgr Oct 21, 2024
e90423e
Config for Peteish annealing
dirkgr Oct 23, 2024
4f6e96b
Correct path
dirkgr Oct 23, 2024
b4652d4
Clean up launcher scripts
dirkgr Oct 23, 2024
2144b57
New metrics
dirkgr Oct 23, 2024
ef3dca1
Annealing config for peteish7-highlr
dirkgr Oct 23, 2024
aa84ed4
Running on half the nodes for now
dirkgr Oct 23, 2024
992b884
Merge remote-tracking branch 'origin/peteish13-augusta' into peteish1…
dirkgr Oct 23, 2024
de3142a
Make sure we don't train to 5T tokens
dirkgr Oct 23, 2024
c5f2d47
Anneal script for medium LR
dirkgr Oct 23, 2024
b03f23d
Permissions
dirkgr Oct 24, 2024
b76eb1a
Merge branch 'peteish13-augusta' of https://github.com/allenai/LLM in…
dirkgr Oct 24, 2024
d22347c
More robust GCS downloads
dirkgr Oct 25, 2024
683afed
Metrics
dirkgr Oct 25, 2024
e52cd73
Config to run the low LR annealing experiment
dirkgr Oct 25, 2024
f6c1eea
Actually go to 4T
dirkgr Oct 28, 2024
05957f2
Metrics
dirkgr Oct 28, 2024
810e23d
Peteish7 XHigh
dirkgr Oct 29, 2024
2d12eb6
Peteish7 XHigh Anneal
dirkgr Oct 29, 2024
61bdf2b
Scripts for anneals on Beakerized Augusta
dirkgr Oct 29, 2024
9692409
Fix dangerous oversight
dirkgr Oct 29, 2024
aa3fd74
100B anneals
dirkgr Oct 29, 2024
cf6a87f
Metrics
dirkgr Oct 29, 2024
37d57f9
Second epoch for the 13B
dirkgr Oct 29, 2024
9015cf1
Merge branch 'peteish13-augusta' of https://github.com/allenai/OLMo i…
dirkgr Oct 29, 2024
76763ad
Inside the container, this path is different
dirkgr Oct 29, 2024
e23a829
Peteish13 anneal with unchanged data
dirkgr Oct 29, 2024
51fa643
Merge branch 'peteish13-augusta' of https://github.com/allenai/LLM in…
dirkgr Oct 29, 2024
793bf63
Need to specify epoch
dirkgr Oct 29, 2024
ddea087
Metrics
dirkgr Oct 29, 2024
11f11c8
Leftover variable
dirkgr Oct 29, 2024
af2c4fd
One more variable
dirkgr Oct 29, 2024
2bb3de8
Ugh
dirkgr Oct 29, 2024
e69e577
This is why you shouldn't write code when you're tired.
dirkgr Oct 29, 2024
9cda3de
Don't need profiles
dirkgr Oct 29, 2024
e7f3bf5
We should not skip this.
dirkgr Oct 29, 2024
e056744
Install more stuff with conda
dirkgr Oct 29, 2024
8c710e8
Maybe this?
dirkgr Oct 29, 2024
b899c18
combining flan and math
soldni Oct 29, 2024
888a9ff
Pete says this is faster without flash anyways!
dirkgr Oct 29, 2024
1abccdd
Don't install flash
dirkgr Oct 29, 2024
d2ca20c
Launch script for 13B anneals
dirkgr Oct 30, 2024
ab4ceb6
Best guess annealing config for 13B
dirkgr Oct 30, 2024
23f8f14
Better mix
dirkgr Oct 30, 2024
2246140
Merge remote-tracking branch 'origin/main' into peteish13-augusta
dirkgr Oct 30, 2024
43c8480
Better evals for the 13B anneals
dirkgr Oct 30, 2024
9a77b60
Epoch is not 1
dirkgr Oct 30, 2024
eb3d754
Script to clear older steps
dirkgr Oct 30, 2024
eafdc3e
Fix the name, and therefore the place where we save things.
dirkgr Oct 30, 2024
8726e45
Merge branch 'peteish13-augusta' of https://github.com/allenai/LLM in…
dirkgr Oct 30, 2024
605a3c0
Eval twice as fast
dirkgr Oct 30, 2024
4ea0d94
We need FA after all.
dirkgr Oct 30, 2024
5408e7e
Try to load from a checkpoint
dirkgr Oct 30, 2024
39c964a
Load latest save
dirkgr Oct 30, 2024
6eec513
We have to restore the dataloader now!
dirkgr Oct 30, 2024
7ea0112
Peteish7 10xlr
dirkgr Oct 31, 2024
d7dad11
Continuing this run
dirkgr Oct 31, 2024
7dba688
Merge commit 'b899c18fb92e14641dc84ce2a8ebf167e54d40c8' into peteish1…
dirkgr Nov 1, 2024
8031acd
7B anneal on Augusta/Beaker
dirkgr Nov 1, 2024
7973bef
Merge branch 'peteish13-augusta' of https://github.com/allenai/LLM in…
dirkgr Nov 1, 2024
c35f8c5
GS authentication is a fucking joke
dirkgr Nov 1, 2024
ae111f4
Revert "GS authentication is a fucking joke"
dirkgr Nov 1, 2024
c141a35
Try load latest save
dirkgr Nov 1, 2024
acba910
Set retries
dirkgr Nov 1, 2024
75a1395
Merge remote-tracking branch 'origin/main' into peteish13-augusta
dirkgr Nov 2, 2024
f8f8a1e
Fix merge gore
dirkgr Nov 2, 2024
4f5866b
So urgent!
dirkgr Nov 2, 2024
21edc84
Don't wait so long for startup
dirkgr Nov 2, 2024
2e1fd4a
Don't overwrite checkpoints!
dirkgr Nov 2, 2024
a1271eb
Cleanup of this config
dirkgr Nov 2, 2024
cb9c22a
Consolidate the 1B config
dirkgr Nov 3, 2024
acf1ccc
Higher priority
dirkgr Nov 3, 2024
3de3c8e
Smaller timeout
dirkgr Nov 3, 2024
e598e0c
Let's just all run with high so we don't preempt each other.
dirkgr Nov 3, 2024
cb740a6
Peteish1 configs
dirkgr Nov 3, 2024
74e30ca
More data loading workers
dirkgr Nov 4, 2024
de19d65
Change peteish7 lr anneals so that they resume from the middle
dirkgr Nov 4, 2024
a6b1c36
With retries!
dirkgr Nov 4, 2024
bdb6c13
Need to specify this now :-(
dirkgr Nov 4, 2024
1ac0437
Bounce epoch
dirkgr Nov 4, 2024
e87265c
Settings for speed!
dirkgr Nov 5, 2024
02ae870
We are urgently important.
dirkgr Nov 5, 2024
62927bd
We are only highly important.
dirkgr Nov 5, 2024
0e8246e
Another config from Pete
dirkgr Nov 5, 2024
8b9b0e4
Try making more GPUs warm
dirkgr Nov 5, 2024
9b1c670
Don't eval so often
dirkgr Nov 5, 2024
f33abac
Even less frequently
dirkgr Nov 5, 2024
cdf4319
High LR config for the 1B
dirkgr Nov 5, 2024
6ca55e7
muP LR
dirkgr Nov 6, 2024
ede2464
Merge remote-tracking branch 'origin/main' into peteish13-augusta
dirkgr Nov 7, 2024
2ba7632
Anneal with different random seeds
dirkgr Nov 7, 2024
57dac1c
Fix settings
dirkgr Nov 7, 2024
d78fb17
Peteish7 medlr on Beaker
dirkgr Nov 7, 2024
2ee05eb
Forgot to substitute variable
dirkgr Nov 7, 2024
6819484
Need different settings for Augusta
dirkgr Nov 7, 2024
e688f07
We can get away with this.
dirkgr Nov 7, 2024
1959960
Peteish13 on Beaker
dirkgr Nov 8, 2024
a489d84
Fix variable
dirkgr Nov 8, 2024
21143d4
Set the remote folder sensibly
dirkgr Nov 8, 2024
cdc544c
Forgot to put the seed in the run name
dirkgr Nov 8, 2024
76e42d7
Wrangle the variables some more
dirkgr Nov 8, 2024
df55ddb
Fix wandb
dirkgr Nov 9, 2024
3dbc034
Peteish13 on Beaker
dirkgr Nov 9, 2024
dbaa442
Another annealing config
dirkgr Nov 9, 2024
2e635cf
Anneal for peteish7 medlr
dirkgr Nov 9, 2024
842ad6a
Actually run to 5T
dirkgr Nov 9, 2024
f18381e
Try with flash
dirkgr Nov 11, 2024
3a2d62e
Dockerfile
dirkgr Nov 12, 2024
0e7b088
Change image
dirkgr Nov 12, 2024
320fd7b
Can't have retries when it's configured like this.
dirkgr Nov 12, 2024
6099826
So urgent!
dirkgr Nov 12, 2024
30eb3a6
Document what we're doing.
dirkgr Nov 12, 2024
f6c4cb8
We can retry again!
dirkgr Nov 12, 2024
2ec3fa0
Run peteish1 on fewer nodes
dirkgr Nov 12, 2024
17b2bca
Actually use all the nodes
dirkgr Nov 12, 2024
eba9075
No retries while we're debugging
dirkgr Nov 12, 2024
7a35ce9
Try longer to start up
dirkgr Nov 12, 2024
34d12fe
Anneal the anneal
dirkgr Nov 13, 2024
8908d65
Annealing config for the 1B
dirkgr Nov 13, 2024
c5b0369
Tix fypo
dirkgr Nov 13, 2024
b3668f4
Turn flash attention back on for 7B anneals
dirkgr Nov 13, 2024
d657a19
No whammy 3 config
Nov 13, 2024
35aaaf3
Urgent
dirkgr Nov 13, 2024
7dc73b3
Merge branch 'peteish13-augusta' of https://github.com/allenai/LLM in…
dirkgr Nov 13, 2024
bca921f
added config
soldni Nov 14, 2024
7b0c0f8
Run with retries
dirkgr Nov 15, 2024
949d80c
Don't run out of space.
dirkgr Nov 15, 2024
ab7d870
Merge branch 'peteish13-augusta' of https://github.com/allenai/LLM in…
dirkgr Nov 15, 2024
93df396
Remove all Augusta specific stuff
dirkgr Nov 15, 2024
bad96cd
Fix paths
dirkgr Nov 15, 2024
214aea5
Remove unused config
dirkgr Nov 15, 2024
200bd1f
Delete all the LUMI scripts
dirkgr Nov 15, 2024
5d8da46
Remove metrics notebook
dirkgr Nov 15, 2024
8b709b9
Changelog
dirkgr Nov 15, 2024
c7c0c5b
Productivity through formatting
dirkgr Nov 15, 2024
6f4a49a
Config for more 13B anneals
dirkgr Nov 16, 2024

Files changed:

3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -14,6 +14,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 - `one_in_eight` configuration for activation checkpointing
 - New tokenizer in the source instead of from huggingface
 - Improved support for GCS
+- `torch.compile()` now only compiles each block, not the whole model.
+- Support for `torch.compile()` with `dynamic=True`.
+- Resetting the `torch.compile()` state after every evaluation, because evaluation messes with the compiled versions.

 ## [v0.5.1](https://github.com/allenai/OLMo/releases/tag/v0.5.1) - 2024-10-17
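The first of the added changelog entries, compiling each block rather than the whole model, keeps each compiled graph small, so compilation is faster and a recompile only touches one block. A minimal sketch of the idea (not OLMo's actual trainer code), assuming the model exposes its transformer blocks as an `nn.ModuleList` attribute named `blocks`:

```python
from typing import Optional

import torch
import torch.nn as nn


def compile_blocks(model: nn.Module, dynamic: Optional[bool] = None) -> nn.Module:
    # Compile each transformer block separately instead of wrapping the whole
    # model in a single torch.compile() call: the graphs stay small, and a
    # shape change recompiles one block graph rather than the entire model.
    for i in range(len(model.blocks)):  # `blocks` is an assumed attribute name
        model.blocks[i] = torch.compile(model.blocks[i], dynamic=dynamic)
    return model
```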

1,206 changes: 1,206 additions & 0 deletions configs/annealing/peteish7-weka-anneal-from-928646-50B-nowup-refine-rw.yaml


1,381 changes: 1,381 additions & 0 deletions configs/peteish1-google.yaml


2 changes: 1 addition & 1 deletion configs/peteish1-weka.yaml
@@ -84,7 +84,7 @@ save_num_unsharded_checkpoints_to_keep: -1
 load_path: null

 max_duration: 1ep
-global_train_batch_size: 1024
+global_train_batch_size: 512
 device_train_microbatch_size: 4

 precision: amp_bf16
1,380 changes: 1,380 additions & 0 deletions configs/peteish13-google.yaml


2 changes: 1 addition & 1 deletion configs/peteish13-s3.yaml
@@ -84,7 +84,7 @@ save_num_unsharded_checkpoints_to_keep: -1
 load_path: null

 max_duration: 1ep
-global_train_batch_size: 1024
+global_train_batch_size: 2048
 device_train_microbatch_size: 2

 precision: amp_bf16
1,380 changes: 1,380 additions & 0 deletions configs/peteish13-weka.yaml


1,382 changes: 1,382 additions & 0 deletions configs/peteish7-google.yaml


11 changes: 11 additions & 0 deletions olmo/config.py
@@ -696,6 +696,17 @@ class CompilerConfig(BaseConfig):
     The backend to use.
     """

+    dynamic: Optional[bool] = None
+    """
+    From the torch docs:
+
+    Use dynamic shape tracing. When this is True, we will up-front attempt to generate a kernel that is as dynamic
+    as possible to avoid recompilations when sizes change. This may not always work as some
+    operations/optimizations will force specialization; use TORCH_LOGS=dynamic to debug overspecialization. When
+    this is False, we will NEVER generate dynamic kernels, we will always specialize. By default (None), we
+    automatically detect if dynamism has occurred and compile a more dynamic kernel upon recompile.
+    """
+

 class DistributedStrategy(StrEnum):
     ddp = "ddp"
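For quick reference, the three values of this config field map directly onto `torch.compile()`'s `dynamic` argument; a small illustration (not taken from this PR):

```python
import torch
import torch.nn as nn

model = nn.Linear(1024, 1024)

# None (the default): specialize on the first shapes seen, then detect
# dynamism and compile a more dynamic kernel upon recompile.
auto = torch.compile(model, dynamic=None)

# True: attempt up front to generate a kernel that is as dynamic as possible,
# so a change in input size (e.g. a different eval batch size) does not
# trigger recompilation.
dyn = torch.compile(model, dynamic=True)

# False: never generate dynamic kernels; every new shape specializes again.
static = torch.compile(model, dynamic=False)
```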
4 changes: 4 additions & 0 deletions olmo/train.py
@@ -1036,6 +1036,10 @@ def eval(self) -> Dict[str, Any]:

         del eval_batches

+        # Eval compiles a bunch more versions, and the result is terrible. This way we get back to zero.
+        if self.cfg.compile is not None:
+            torch.compiler.reset()
+
         return eval_metrics

     def check_if_cancelled(self) -> Tuple[bool, int]:

Review thread on this change:

Member: What do you mean, that the result is terrible?

Member: So this prompted me to look into this a bit more, and I think I've found a better solution: just mark the model input sizes as dynamic. I tested this out in OLMo-core and it appears to work well. allenai/OLMo-core#105

Member Author: I think it compiles a bunch of versions for different batch sizes, because that's how we call it during eval, and then they stick around. In all of my early runs I had high tps until the first eval, and then low tps afterwards. This is what fixed it.

Member Author: I tried dynamic and it was bad. I don't remember the way in which it was bad, but it didn't work. That's why I added that version in the first place.

Member: Ok, oh well. I tested with nightly, so maybe it's just better now with recent compiler advances.
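The reviewer's alternative (allenai/OLMo-core#105) is to mark the batch dimension of the model inputs as dynamic, so eval's varying batch sizes share one kernel instead of each leaving a cached specialization behind. A rough sketch of that approach, assuming the batch dimension is dimension 0; it is not what this PR merges:

```python
import torch


def run_eval_batch(compiled_model: torch.nn.Module, input_ids: torch.Tensor):
    # Mark the batch dimension as dynamic before the compiled forward pass, so
    # different eval batch sizes reuse one dynamic kernel instead of compiling
    # and caching a new specialization per size.
    torch._dynamo.mark_dynamic(input_ids, 0)
    with torch.no_grad():
        return compiled_model(input_ids)
```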
10 changes: 3 additions & 7 deletions olmo/util.py
@@ -432,9 +432,7 @@ def _gcs_is_retriable(exception: Exception) -> bool:


 def _gcs_upload(source: Path, bucket_name: str, key: str, save_overwrite: bool = False):
-    from google.cloud import storage as gcs
-
-    storage_client = gcs.Client()
+    storage_client = _get_gcs_client()
     bucket = storage_client.bucket(bucket_name)
     blob = bucket.blob(key)
     if not save_overwrite and blob.exists():
@@ -444,9 +442,8 @@ def _gcs_upload(source: Path, bucket_name: str, key: str, save_overwrite: bool = False):

 def _gcs_file_size(bucket_name: str, key: str) -> int:
     from google.api_core.exceptions import NotFound
-    from google.cloud import storage as gcs

-    storage_client = gcs.Client()
+    storage_client = _get_gcs_client()
     bucket = storage_client.bucket(bucket_name)
     blob = bucket.blob(key)
     try:
@@ -459,9 +456,8 @@ def _gcs_file_size(bucket_name: str, key: str) -> int:

 def _gcs_get_bytes_range(bucket_name: str, key: str, bytes_start: int, num_bytes: int) -> bytes:
     from google.api_core.exceptions import NotFound
-    from google.cloud import storage as gcs

-    storage_client = gcs.Client()
+    storage_client = _get_gcs_client()
     bucket = storage_client.bucket(bucket_name)
     blob = bucket.blob(key)
     try:
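The `_get_gcs_client()` helper itself is in a part of the diff that is not rendered here. Going by the commit message "Have to hold on to the GCS client so we don't overwhelm the GCS metad[ata server]", its job is to create one client and reuse it across calls; a minimal sketch of such a helper, under that assumption only:

```python
from typing import Optional

from google.cloud import storage as gcs

_gcs_client: Optional[gcs.Client] = None


def _get_gcs_client() -> gcs.Client:
    # Create the storage client once and return the same instance afterwards,
    # so repeated uploads and range reads don't each spin up a new client and
    # hammer the GCS metadata endpoints.
    global _gcs_client
    if _gcs_client is None:
        _gcs_client = gcs.Client()
    return _gcs_client
```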
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -14,7 +14,7 @@ requires-python = ">=3.8"
 license = { file = "LICENSE" }
 dependencies = [
     "numpy<2",
-    "torch>=2.1,<2.5",
+    "torch>=2.1",
     "ai2-olmo-core==0.1.0",
     "omegaconf",
     "rich",
79 changes: 79 additions & 0 deletions scripts/augusta/Dockerfile
@@ -0,0 +1,79 @@
FROM --platform=linux/amd64 nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04

ARG DEBIAN_FRONTEND="noninteractive"
ENV TZ="America/Los_Angeles"

# Install base tools.
RUN apt-get update && apt-get install -y \
build-essential \
curl \
git \
jq \
language-pack-en \
make \
sudo \
unzip \
vim \
wget \
parallel \
iputils-ping \
tmux

ARG BEAKER_VERSION
RUN curl --silent \
--connect-timeout 5 \
--max-time 10 \
--retry 5 \
--retry-delay 0 \
--retry-max-time 40 \
--output beaker.tar.gz \
"https://beaker.org/api/v3/release/cli?os=linux&arch=amd64&version=${BEAKER_VERSION}" \
&& tar -zxf beaker.tar.gz -C /usr/local/bin/ ./beaker \
&& rm beaker.tar.gz

# This ensures the dynamic linker (or NVIDIA's container runtime, I'm not sure)
# puts the right NVIDIA things in the right place
ENV NVIDIA_DRIVER_CAPABILITIES=graphics,utility,compute

# Install conda. We give anyone in the users group the ability to run
# conda commands and install packages in the base (default) environment.
# Things installed into the default environment won't persist, but we prefer
# convenience in this case and try to make sure the user is aware of this
# with a message that's printed when the session starts.
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.1.0-1-Linux-x86_64.sh \
&& echo "32d73e1bc33fda089d7cd9ef4c1be542616bd8e437d1f77afeeaf7afdb019787 Miniconda3-py310_23.1.0-1-Linux-x86_64.sh" \
| sha256sum --check \
&& bash Miniconda3-py310_23.1.0-1-Linux-x86_64.sh -b -p /opt/miniconda3 \
&& rm Miniconda3-py310_23.1.0-1-Linux-x86_64.sh

ENV PATH=/opt/miniconda3/bin:/opt/miniconda3/condabin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH

RUN conda install -y pytorch::pytorch==2.5.1 packaging "numpy<2"

# Ensure users can modify their container environment.
RUN echo '%users ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers

# Install MLNX OFED user-space drivers
# See https://docs.nvidia.com/networking/pages/releaseview.action?pageId=15049785#Howto:DeployRDMAacceleratedDockercontaineroverInfiniBandfabric.-Dockerfile
ENV MOFED_VER 5.8-1.1.2.1
ENV OS_VER ubuntu20.04
ENV PLATFORM x86_64
RUN wget --quiet https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VER}/MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
tar -xvf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz && \
MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}/mlnxofedinstall --basic --user-space-only --without-fw-update -q && \
rm -rf MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM} && \
rm MLNX_OFED_LINUX-${MOFED_VER}-${OS_VER}-${PLATFORM}.tgz

RUN apt-get install ninja-build -y

ENV HF_HUB_ENABLE_HF_TRANSFER=1
RUN pip install --no-cache-dir --upgrade pip "setuptools<70.0.0" wheel
# TODO, unpin setuptools when this issue in flash attention is resolved
RUN pip install --no-cache-dir flash-attn==2.6.3 --no-build-isolation
RUN python -c "import torch; print(torch.__version__)"

RUN pip install --no-cache-dir ai2-olmo-core==0.1.0 omegaconf rich boto3 google-cloud-storage tokenizers "cached_path>=1.6.2" transformers importlib_resources py-spy wandb beaker-gantry click torchmetrics safetensors datasets scikit-learn "msgspec>=0.14.0" "smashed[remote]>=0.21.1"

RUN apt-get clean
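
Building the image requires passing the Beaker CLI version as a build argument; an invocation might look like the following, where the version string and image tag are illustrative:

```bash
docker build \
  --build-arg BEAKER_VERSION=v1.5.125 \
  -f scripts/augusta/Dockerfile \
  -t olmo-augusta \
  .
```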

37 changes: 37 additions & 0 deletions scripts/augusta/peteish1-launch.sh
@@ -0,0 +1,37 @@
#!/usr/bin/env bash

set -ex

NUM_NODES=$1
shift

gantry run \
--workspace ai2/13B \
--task-name peteish1 \
--description "Peteish1" \
--priority urgent \
--preemptible \
--beaker-image michalg/cuda11.8-ubuntu20.04-arb \
--cluster ai2/augusta-google-1 \
--gpus 8 \
--replicas "${NUM_NODES}" \
--leader-selection \
--host-networking \
--budget ai2/oe-training \
--no-nfs \
--propagate-failure \
--propagate-preemption \
--synchronized-start-timeout 15m \
--no-python \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env OLMO_TASK=model \
--env-secret WANDB_API_KEY=DIRKG_WANDB_API_KEY \
--env-secret AWS_ACCESS_KEY_ID=DIRKG_AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=DIRKG_AWS_SECRET_ACCESS_KEY \
--shared-memory 10GiB \
--yes \
--timeout=-1 \
--allow-dirty \
--retries 10 \
-- /bin/bash -c "scripts/augusta/peteish1.sh \$BEAKER_LEADER_REPLICA_HOSTNAME \$BEAKER_REPLICA_RANK"
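
The only argument is the node count; gantry starts one replica per node, and Beaker fills in `BEAKER_LEADER_REPLICA_HOSTNAME` and `BEAKER_REPLICA_RANK` for the inner `peteish1.sh`. For example (node count illustrative):

```bash
bash scripts/augusta/peteish1-launch.sh 8
```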
37 changes: 37 additions & 0 deletions scripts/augusta/peteish1-muplr-launch.sh
@@ -0,0 +1,37 @@
#!/usr/bin/env bash

set -ex

NUM_NODES=$1
shift

gantry run \
--workspace ai2/13B \
--task-name peteish1-muplr \
--description "Peteish1 muP LR" \
--priority high \
--preemptible \
--beaker-image michalg/cuda11.8-ubuntu20.04-arb \
--cluster ai2/augusta-google-1 \
--gpus 8 \
--replicas "${NUM_NODES}" \
--leader-selection \
--host-networking \
--budget ai2/oe-training \
--no-nfs \
--propagate-failure \
--propagate-preemption \
--synchronized-start-timeout 15m \
--no-python \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env OLMO_TASK=model \
--env-secret WANDB_API_KEY=DIRKG_WANDB_API_KEY \
--env-secret AWS_ACCESS_KEY_ID=DIRKG_AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=DIRKG_AWS_SECRET_ACCESS_KEY \
--shared-memory 10GiB \
--yes \
--timeout=-1 \
--allow-dirty \
--retries 10 \
-- /bin/bash -c "scripts/augusta/peteish1-muplr.sh \$BEAKER_LEADER_REPLICA_HOSTNAME \$BEAKER_REPLICA_RANK"
87 changes: 87 additions & 0 deletions scripts/augusta/peteish1-muplr.sh
@@ -0,0 +1,87 @@
#!/usr/bin/env bash

set -exuo pipefail
IFS=$'\n\t'

BEAKER_LEADER_REPLICA_HOSTNAME=$1
shift

BEAKER_REPLICA_RANK=$1
shift

# augusta specific environment
export LD_LIBRARY_PATH="/var/lib/tcpxo/lib64:${LD_LIBRARY_PATH}"
export NCCL_CROSS_NIC=0
export NCCL_ALGO=Ring,Tree
export NCCL_PROTO=Simple
export NCCL_MIN_NCHANNELS=4
export NCCL_P2P_NET_CHUNKSIZE=524288
export NCCL_P2P_PCI_CHUNKSIZE=524288
export NCCL_P2P_NVL_CHUNKSIZE=1048576
export NCCL_FASTRAK_NUM_FLOWS=2
export NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL=0
export NCCL_BUFFSIZE=8388608
export NCCL_FASTRAK_USE_SNAP=1
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export NCCL_NET_GDR_LEVEL=PIX
export NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING=0
export NCCL_TUNER_PLUGIN=libnccl-tuner.so
export NCCL_TUNER_CONFIG_PATH=/var/lib/tcpxo/lib64/a3plus_tuner_config.textproto
export NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE=/var/lib/tcpxo/lib64/a3plus_guest_config.textproto
export NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS=600000
export NCCL_NVLS_ENABLE=0
export NCCL_DEBUG=WARN
export NCCL_FASTRAK_CTRL_DEV=enp0s12
export NCCL_FASTRAK_IFNAME=enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0
export NCCL_SOCKET_IFNAME=enp0s12
export NCCL_USE_SNAP=1
export NCCL_FASTRAK_USE_LLCM=1
export NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY=/dev/aperture_devices

# Install flash-attn
#conda install -y pytorch-cuda==12.4 packaging ninja cccl cuda-nvcc libcusolver-dev cuda-profiler-api libcusparse-dev libcublas-dev -c pytorch -c nvidia
#pip install flash-attn==2.5.9.post1 --no-build-isolation
pip install '.[train]'
pip freeze

# Force processes to synchronize at init_process_group
export TORCH_DIST_INIT_BARRIER=1
# Better error handling from Python
export PYTHONFAULTHANDLER=1

NAME=${GANTRY_TASK_NAME// /_}
RUN_NAME=$NAME-$(date -u +"%Y%m%d_%H%M%S")
SAVE_FOLDER=/data/$RUN_NAME
mkdir -p $SAVE_FOLDER

torchrun \
--nnodes "${BEAKER_REPLICA_COUNT}:${BEAKER_REPLICA_COUNT}" \
--nproc-per-node 8 \
--rdzv_id 12348 \
--rdzv_backend static \
--rdzv_endpoint "${BEAKER_LEADER_REPLICA_HOSTNAME}:29400" \
--node_rank "${BEAKER_REPLICA_RANK}" \
--rdzv_conf 'read_timeout=420' \
scripts/train.py \
configs/peteish1-google.yaml \
--run_name=$RUN_NAME \
--wandb.group=$NAME \
--optimizer.learning_rate=7.81e-3 \
--save_interval_ephemeral=10000 \
--eval_interval=10000 \
--fsdp.sharding_strategy=HYBRID_SHARD \
--fsdp.hybrid_sharding_num_model_replicas="${BEAKER_REPLICA_COUNT}" \
--fsdp.wrapping_strategy=by_block_and_size \
--save_folder=$SAVE_FOLDER \
--remote_save_folder="gs://ai2-llm/checkpoints/OLMo-medium/$NAME/" \
--try_load_latest_save \
--save_overwrite \
--sharded_checkpointer=olmo_core \
--device_train_microbatch_size=4 \
--device_eval_batch_size=8 \
--compile.fullgraph=false \
--fused_loss=false \
--model.flash_attention=false \
--data.num_workers=32 \
--optimizer.metrics_log_interval=10 \
--data.prefetch_factor=8
41 changes: 41 additions & 0 deletions scripts/augusta/peteish1-seed-anneal-launch.sh
@@ -0,0 +1,41 @@
#!/usr/bin/env bash

set -ex

NUM_NODES=$1
shift

NAME=$1
shift

SEED=$1
shift

gantry run \
--workspace ai2/13B \
--task-name $NAME \
--description "Peteish1 annealing : $NAME with seed $SEED" \
--priority urgent \
--preemptible \
--beaker-image dirkg/OLMo \
--cluster ai2/augusta-google-1 \
--gpus 8 \
--replicas "${NUM_NODES}" \
--leader-selection \
--host-networking \
--budget ai2/oe-training \
--no-nfs \
--propagate-failure \
--propagate-preemption \
--synchronized-start-timeout 15m \
--no-python \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env OLMO_TASK=model \
--env-secret WANDB_API_KEY=DIRKG_WANDB_API_KEY \
--env-secret AWS_ACCESS_KEY_ID=DIRKG_AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=DIRKG_AWS_SECRET_ACCESS_KEY \
--shared-memory 10GiB \
--yes \
--timeout=-1 \
-- /bin/bash -c "scripts/augusta/peteish1-seed-anneal.sh \$BEAKER_LEADER_REPLICA_HOSTNAME \$BEAKER_REPLICA_RANK $SEED"
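
This launcher takes three positional arguments: the node count, a task name, and the random seed that is forwarded to `peteish1-seed-anneal.sh`. For example (values illustrative):

```bash
bash scripts/augusta/peteish1-seed-anneal-launch.sh 8 peteish1-anneal-seed42 42
```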