Skip to content

Commit

Permalink
update capacity resolution
Browse files Browse the repository at this point in the history
  • Loading branch information
Delaunay committed Mar 1, 2024
1 parent 10dfb57 commit 3366641
Show file tree
Hide file tree
Showing 9 changed files with 46 additions and 38 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ jobs:
matrix:
include:
- arch: cuda
exclude : "no-cuda"
exclude : "unsupported-cuda"
# - arch: rocm
# exclude : "no-rocm"
# exclude : "unsupported-rocm"

runs-on: [self-hosted, "${{ matrix.arch }}"]

Expand Down
2 changes: 1 addition & 1 deletion config/base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ rwkv:
tags:
- llm
- rnn
- no-rocm
- unsupported-rocm
plan:
method: per_gpu
argv:
Expand Down
17 changes: 8 additions & 9 deletions milabench/cli/publish.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
import re
import json
import subprocess
from contextlib import contextmanager
import multiprocessing
from dataclasses import dataclass
from urllib.parse import urlparse, ParseResult
import time
import threading
import signal
import os
import re
import signal
import subprocess
import sys
import threading
import time
from contextlib import contextmanager
from dataclasses import dataclass
from urllib.parse import ParseResult, urlparse

from coleo import Option, tooled


SLEEP = 0.01
_INIT = 0
_READY = 1
Expand Down
9 changes: 5 additions & 4 deletions milabench/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def arguments():

# Define capabilities
capabilities: Option = ""

return CommonArguments(
config,
system,
Expand All @@ -91,7 +91,7 @@ def arguments():
def get_multipack(args = None, run_name=None, overrides={}):
if args is None:
args = arguments()

override = [
o if re.match(pattern=r"[.\w]+=", string=o) else f"={o}" for o in args.override
]
Expand Down Expand Up @@ -225,13 +225,14 @@ def _get_multipack(

arch = deduce_arch()
base_defaults = get_base_defaults(
base=args.base,
arch=arch,
base=args.base,
arch=arch,
run_name=run_name
)
system_config = build_system_config(
args.system,
defaults={"system": base_defaults["_defaults"]["system"]},
gpu=True
)
overrides = merge({"*": system_config}, overrides)

Expand Down
29 changes: 22 additions & 7 deletions milabench/config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import contextvars
import os
import socket

import psutil
Expand Down Expand Up @@ -174,16 +175,29 @@ def resolve_addresses(nodes):
return self


def get_gpu_capacity():
capacity = float("+inf")
def get_gpu_capacity(strict=False):
try:
capacity = 0

for k, v in get_gpu_info()["gpus"].items():
capacity = min(v["memory"]["total"], capacity)

return capacity
except:
print("GPU not available, defaulting to 0 MiB")
if strict:
raise
return 0

for k, v in get_gpu_info()["gpus"].items():
capacity = min(v["memory"]["total"], capacity)

return capacity
def is_autoscale_enabled():
return (
os.getenv("MILABENCH_SIZER_AUTO", False)
or os.getenv("MILABENCH_SIZER_MULTIPLE") is not None
)


def build_system_config(config_file, defaults=None):
def build_system_config(config_file, defaults=None, gpu=True):
"""Load the system configuration, verify its validity and resolve ip addresses
Notes
Expand All @@ -204,7 +218,8 @@ def build_system_config(config_file, defaults=None):

system = config.get("system", {})

if "gpu" not in system:
# capacity is only required if batch resizer is enabled
if (gpu or is_autoscale_enabled()) and not "gpu" not in system:
system["gpu"] = {"capacity": f"{int(get_gpu_capacity())} MiB"}

if system.get("sshkey") is not None:
Expand Down
12 changes: 6 additions & 6 deletions milabench/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,9 +300,9 @@ def on_data(self, entry, data, row):
load = int(data.get("load", 0) * 100)
currm, totalm = data.get("memory", [0, 0])
temp = int(data.get("temperature", 0))
row[
f"gpu:{gpuid}"
] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
row[f"gpu:{gpuid}"] = (
f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
)
row["gpu_load"] = f"{load}%"
row["gpu_mem"] = f"{currm:.0f}/{totalm:.0f} MB"
row["gpu_temp"] = f"{temp}C"
Expand Down Expand Up @@ -376,9 +376,9 @@ def on_data(self, entry, data, row):
load = int(data.get("load", 0) * 100)
currm, totalm = data.get("memory", [0, 0])
temp = int(data.get("temperature", 0))
row[
f"gpu:{gpuid}"
] = f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
row[f"gpu:{gpuid}"] = (
f"{load}% load | {currm:.0f}/{totalm:.0f} MB | {temp}C"
)
else:
task = data.pop("task", "")
units = data.pop("units", "")
Expand Down
1 change: 0 additions & 1 deletion milabench/merge.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Utilities to merge dictionaries and other data structures."""


from collections import deque
from functools import reduce
from typing import Union
Expand Down
1 change: 1 addition & 0 deletions milabench/scripts/vcs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Use to retrieve GIT version info, this file cannot import milabench modules
as it is executed as part of the installation process"""

import os
import subprocess
import warnings
Expand Down
9 changes: 1 addition & 8 deletions milabench/sizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,14 @@
import numpy as np
import yaml

from .config import system_global
from .config import is_autoscale_enabled, system_global
from .validation.validation import ValidationLayer

ROOT = os.path.dirname(__file__)

default_scaling_config = os.path.join(ROOT, "..", "config", "scaling.yaml")


def is_autoscale_enabled():
return (
os.getenv("MILABENCH_SIZER_AUTO", False)
or os.getenv("MILABENCH_SIZER_MULTIPLE") is not None
)


def getenv(name, type):
value = os.getenv(name)

Expand Down

0 comments on commit 3366641

Please sign in to comment.