[misc] refactor launch API and tensor constructor #5666

Merged · 7 commits · Apr 29, 2024
2 changes: 1 addition & 1 deletion .github/workflows/doc_test_on_pr.yml
@@ -56,7 +56,7 @@ jobs:
needs: detect-changed-doc
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
options: --gpus all --rm
timeout-minutes: 20
defaults:
2 changes: 1 addition & 1 deletion applications/Colossal-LLaMA/train.py
@@ -136,7 +136,7 @@ def main() -> None:
# ==============================
# Initialize Distributed Training
# ==============================
colossalai.launch_from_torch({})
colossalai.launch_from_torch()
accelerator = get_accelerator()
coordinator = DistCoordinator()

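The same one-line migration applies to every call site touched by this PR: the unused `config` dict is dropped from `colossalai.launch_from_torch`. A minimal before/after sketch, assuming a script started with `torchrun` against this PR's revision of ColossalAI:

```python
import colossalai

# Before this PR an empty (and ignored) config dict had to be passed:
#   colossalai.launch_from_torch({})
# After this PR the config argument is gone; rank, world size, host and port
# are read from the environment variables set by torchrun.
colossalai.launch_from_torch(seed=42)
```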
2 changes: 1 addition & 1 deletion applications/ColossalChat/benchmarks/benchmark_ppo.py
@@ -66,7 +66,7 @@ def benchmark_train(args):
# ==============================
# Initialize Distributed Training
# ==============================
colossalai.launch_from_torch({})
colossalai.launch_from_torch()
coordinator = DistCoordinator()

# ======================================================
@@ -37,7 +37,7 @@ def train(args):
# ==============================
# Initialize Distributed Training
# ==============================
colossalai.launch_from_torch({})
colossalai.launch_from_torch()
coordinator = DistCoordinator()

# ==============================
@@ -39,7 +39,7 @@ def train(args):
# ==============================
# Initialize Distributed Training
# ==============================
colossalai.launch_from_torch({})
colossalai.launch_from_torch()
coordinator = DistCoordinator()

# ======================================================
@@ -34,7 +34,7 @@ def train(args):
# ==============================
# Initialize Distributed Training
# ==============================
colossalai.launch_from_torch({})
colossalai.launch_from_torch()
coordinator = DistCoordinator()

# ======================================================
@@ -29,7 +29,7 @@ def train(args):
# ==============================
# Initialize Distributed Training
# ==============================
colossalai.launch_from_torch({})
colossalai.launch_from_torch()
coordinator = DistCoordinator()

# ==============================
@@ -81,7 +81,7 @@ def rm_and_merge(


def main(args):
colossalai.launch_from_torch(config={}, seed=42)
colossalai.launch_from_torch(seed=42)
accelerator = get_accelerator()
world_size = dist.get_world_size()

@@ -81,7 +81,7 @@ def rm_and_merge(


def main(args):
colossalai.launch_from_torch(config={}, seed=42)
colossalai.launch_from_torch(seed=42)
world_size = dist.get_world_size()

rank = dist.get_rank()
8 changes: 6 additions & 2 deletions applications/ColossalMoE/infer.py
@@ -57,7 +57,7 @@ def main():
args = parse_args()

# Launch ColossalAI
colossalai.launch_from_torch(config={}, seed=args.seed)
colossalai.launch_from_torch(seed=args.seed)
coordinator = DistCoordinator()

config = MixtralConfig.from_pretrained(args.model_name)
@@ -96,7 +96,7 @@ def main():
if coordinator.rank == 0:
text = ["Hello my name is"]
else:
text = ["What's the largest country in the world?", "How many people live in China?", "帮我续写这首诗:离离原上草"]
text = [
"What's the largest country in the world?",
"How many people live in China?",
"帮我续写这首诗:离离原上草",
]
tokenizer.pad_token = tokenizer.unk_token
inputs = tokenizer(text, return_tensors="pt", padding=True).to(torch.cuda.current_device())

2 changes: 1 addition & 1 deletion applications/ColossalMoE/tests/test_mixtral_layer.py
@@ -50,7 +50,7 @@ def check_mixtral_moe_layer():


def run_dist(rank: int, world_size: int, port: int):
colossalai.launch({}, rank, world_size, "localhost", port)
colossalai.launch(rank, world_size, "localhost", port)
check_mixtral_moe_layer()


2 changes: 1 addition & 1 deletion applications/ColossalMoE/tests/test_moe_checkpoint.py
@@ -133,7 +133,7 @@ def check_mixtral_moe_layer():


def run_dist(rank: int, world_size: int, port: int):
colossalai.launch({}, rank, world_size, "localhost", port)
colossalai.launch(rank, world_size, "localhost", port)
check_mixtral_moe_layer()


8 changes: 4 additions & 4 deletions applications/ColossalMoE/train.py
@@ -145,7 +145,7 @@ def main():
args = parse_args()

# Launch ColossalAI
colossalai.launch_from_torch(config={}, seed=args.seed)
colossalai.launch_from_torch(seed=args.seed)
coordinator = DistCoordinator()

# Set plugin
@@ -195,9 +195,9 @@ def main():
lr_scheduler = CosineAnnealingWarmupLR(
optimizer=optimizer,
total_steps=args.num_epochs * len(dataloader),
warmup_steps=args.warmup_steps
if args.warmup_steps is not None
else int(args.num_epochs * len(dataloader) * 0.025),
warmup_steps=(
args.warmup_steps if args.warmup_steps is not None else int(args.num_epochs * len(dataloader) * 0.025)
),
eta_min=0.1 * args.lr,
)

2 changes: 1 addition & 1 deletion colossalai/auto_parallel/offload/amp_optimizer.py
@@ -126,7 +126,7 @@ def loss_scale(self):
return self.grad_scaler.scale.item()

def zero_grad(self, *args, **kwargs):
self.module.overflow_counter = torch.cuda.IntTensor([0])
self.module.overflow_counter = torch.tensor([0], dtype=torch.int, device=get_accelerator().get_current_device())
return self.optim.zero_grad(set_to_none=True)

def step(self, *args, **kwargs):
4 changes: 2 additions & 2 deletions colossalai/auto_parallel/offload/base_offload_module.py
@@ -4,7 +4,7 @@
import torch
import torch.nn as nn

from colossalai.utils import _cast_float
from colossalai.utils import _cast_float, get_current_device
from colossalai.utils.common import free_storage

from .region_manager import RegionManager
@@ -25,7 +25,7 @@ def __init__(self, model: nn.Module, region_manager: RegionManager, is_sync=True
self.model = model
self.region_manager = region_manager
self.grad_hook_list = []
self.overflow_counter = torch.cuda.IntTensor([0])
self.overflow_counter = torch.tensor([0], dtype=torch.int, device=get_current_device())

self.grad_offload_stream = torch.cuda.current_stream() if is_sync else GlobalRuntimeInfo.d2h_stream

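Both overflow counters switch from the CUDA-only `torch.cuda.IntTensor` constructor to a plain `torch.tensor` placed on whatever device the active accelerator reports, so the same code can run on non-CUDA backends. A minimal sketch of the pattern, assuming the `get_current_device` helper imported above:

```python
import torch

from colossalai.utils import get_current_device

# Device-agnostic replacement for torch.cuda.IntTensor([0]): the counter is
# created on the accelerator's current device (CUDA GPU, NPU, ...) instead of
# assuming CUDA.
overflow_counter = torch.tensor([0], dtype=torch.int, device=get_current_device())
```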
5 changes: 3 additions & 2 deletions colossalai/booster/plugin/torch_ddp_plugin.py
@@ -10,6 +10,7 @@
from colossalai.cluster import DistCoordinator
from colossalai.interface import ModelWrapper, OptimizerWrapper
from colossalai.quantization import BnbQuantizationConfig, quantize_model
from colossalai.utils import get_current_device

from .dp_plugin_base import DPPluginBase

@@ -203,7 +204,7 @@ def control_device(self) -> bool:
return True

def supported_devices(self) -> List[str]:
return ["cuda"]
return ["cuda", "npu"]

def configure(
self,
@@ -214,7 +215,7 @@ def configure(
lr_scheduler: Optional[LRScheduler] = None,
) -> Tuple[nn.Module, OptimizerWrapper, Callable, DataLoader, LRScheduler]:
# cast model to cuda
model = model.cuda()
model = model.to(get_current_device())

# convert model to sync bn
model = nn.SyncBatchNorm.convert_sync_batchnorm(model, None)
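Because `supported_devices` now also reports `"npu"`, the plugin can no longer hard-code `model.cuda()`; the model is moved to whichever device the accelerator layer selects. A short sketch of the placement pattern, with `ToyModel` as a hypothetical stand-in for the user's module:

```python
import torch.nn as nn

from colossalai.utils import get_current_device


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(8, 8)

    def forward(self, x):
        return self.linear(x)


# Instead of model.cuda(), move the module to the device reported by the
# accelerator layer; on an NVIDIA box this is a CUDA GPU, on Ascend an NPU.
model = ToyModel().to(get_current_device())
```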
2 changes: 1 addition & 1 deletion colossalai/inference/README.md
@@ -114,7 +114,7 @@ import colossalai
from transformers import LlamaForCausalLM, LlamaTokenizer

#launch distributed environment
colossalai.launch_from_torch(config={})
colossalai.launch_from_torch()

# load original model and tokenizer
model = LlamaForCausalLM.from_pretrained("/path/to/model")
16 changes: 1 addition & 15 deletions colossalai/initialize.py
@@ -2,20 +2,15 @@
# -*- encoding: utf-8 -*-

import os
import warnings
from pathlib import Path
from typing import Dict, Union

import torch.distributed as dist

from colossalai.accelerator import get_accelerator
from colossalai.context import Config
from colossalai.logging import get_dist_logger
from colossalai.utils import set_seed


def launch(
config: Union[str, Path, Config, Dict],
rank: int,
world_size: int,
host: str,
@@ -44,8 +39,6 @@ Raises:
Raises:
Exception: Raise exception when config type is wrong
"""
if rank == 0:
warnings.warn("`config` is deprecated and will be removed soon.")

cur_accelerator = get_accelerator()

@@ -68,7 +61,6 @@


def launch_from_slurm(
config: Union[str, Path, Config, Dict],
host: str,
port: int,
backend: str = "nccl",
@@ -95,7 +87,6 @@
)

launch(
config=config,
rank=rank,
world_size=world_size,
host=host,
@@ -107,7 +98,6 @@


def launch_from_openmpi(
config: Union[str, Path, Config, Dict],
host: str,
port: int,
backend: str = "nccl",
@@ -135,7 +125,6 @@
)

launch(
config=config,
local_rank=local_rank,
rank=rank,
world_size=world_size,
@@ -147,9 +136,7 @@
)


def launch_from_torch(
config: Union[str, Path, Config, Dict], backend: str = "nccl", seed: int = 1024, verbose: bool = True
):
def launch_from_torch(backend: str = "nccl", seed: int = 1024, verbose: bool = True):
"""A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size
from the environment variables set by PyTorch

@@ -171,7 +158,6 @@ def launch_from_torch(
)

launch(
config=config,
local_rank=local_rank,
rank=rank,
world_size=world_size,
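After the refactor none of the entry points in `colossalai/initialize.py` accept a `config` argument; `launch` only needs the process coordinates, and `launch_from_torch` takes nothing beyond the optional `backend`/`seed`/`verbose`. A condensed sketch of the two launch styles, mirroring the call sites elsewhere in this PR (the localhost host/port values are illustrative):

```python
import colossalai

# Spawn-style launch (as in the unit-test helpers): the former leading
# `config` argument is simply dropped.
def run_dist(rank: int, world_size: int, port: int):
    colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")

# torchrun-style launch: everything is read from the torchrun environment.
def run_torchrun():
    colossalai.launch_from_torch(seed=1024)
```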
@@ -56,7 +56,7 @@ def setup(self, world_size, rank, port):
# initialize a ray collective group, otherwise colossalai distributed env won't be built successfully
collective.init_collective_group(world_size, rank, "nccl", "default")
# initialize and set distributed environment
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
ray_serve_logger.info(f"Worker with rank {rank} (world size {world_size}) setting up..")
log_cuda_info("Worker.setup")

2 changes: 1 addition & 1 deletion colossalai/legacy/inference/hybridengine/engine.py
@@ -42,7 +42,7 @@ class CaiInferEngine:
import colossalai
from transformers import LlamaForCausalLM, LlamaTokenizer

colossalai.launch_from_torch(config={})
colossalai.launch_from_torch()

model = LlamaForCausalLM.from_pretrained("your_path_to_model")
tokenizer = LlamaTokenizer.from_pretrained("/home/lczyh/share/models/llama-7b-hf")
34 changes: 17 additions & 17 deletions colossalai/legacy/inference/pipeline/README.md
@@ -36,7 +36,7 @@ from colossalai.inference.pipeline.policies import LlamaModelInferPolicy
import colossalai
from transformers import LlamaForCausalLM, LlamaTokenizer

colossalai.launch_from_torch(config={})
colossalai.launch_from_torch()

model = LlamaForCausalLM.from_pretrained("/path/to/model")
tokenizer = LlamaTokenizer.from_pretrained("/path/to/model")
@@ -57,27 +57,27 @@ We conducted multiple benchmark tests to evaluate the performance. We compared t
### Llama Throughput (tokens/s) | input length=1024, output length=128

#### A10 7b, fp16
| batch_size(micro_batch size)| 2(1) | 4(2) | 8(4) | 16(8) | 32(8) | 32(16)|
| :---: | :---: | :---: | :---: | :---: | :---: | :---:|
| Pipeline Inference | 40.35 | 77.1 | 139.03 | 232.7 | 257.81 | OOM |
| Hugging Face | 41.43 | 65.30 | 91.93 | 114.62 | OOM| OOM |
| batch_size(micro_batch size) | 2(1) | 4(2) | 8(4) | 16(8) | 32(8) | 32(16) |
|:----------------------------:|:-----:|:-----:|:------:|:------:|:------:|:------:|
| Pipeline Inference | 40.35 | 77.1 | 139.03 | 232.7 | 257.81 | OOM |
| Hugging Face | 41.43 | 65.30 | 91.93 | 114.62 | OOM | OOM |

#### A10 13b, fp16
| batch_size(micro_batch size)| 2(1) | 4(2) | 8(4) | 16(4) |
| :---: | :---: | :---: | :---: | :---: |
| Pipeline Inference | 25.39 | 47.09 | 83.7 | 89.46 |
| Hugging Face | 23.48 | 37.59 | 53.44 | OOM |
| batch_size(micro_batch size) | 2(1) | 4(2) | 8(4) | 16(4) |
|:----------------------------:|:-----:|:-----:|:-----:|:-----:|
| Pipeline Inference | 25.39 | 47.09 | 83.7 | 89.46 |
| Hugging Face | 23.48 | 37.59 | 53.44 | OOM |


#### A800 7b, fp16
| batch_size(micro_batch size) | 2(1) | 4(2) | 8(4) | 16(8) | 32(16) |
| :---: | :---: | :---: | :---: | :---: | :---: |
| Pipeline Inference| 57.97 | 110.13 | 213.33 | 389.86 | 670.12 |
| Hugging Face | 42.44 | 76.5 | 151.97 | 212.88 | 256.13 |
| batch_size(micro_batch size) | 2(1) | 4(2) | 8(4) | 16(8) | 32(16) |
|:----------------------------:|:-----:|:------:|:------:|:------:|:------:|
| Pipeline Inference | 57.97 | 110.13 | 213.33 | 389.86 | 670.12 |
| Hugging Face | 42.44 | 76.5 | 151.97 | 212.88 | 256.13 |


#### A800 13b, fp16
| batch_size(micro_batch size) | 2(1) | 4(2) | 8(4) | 16(8) | 32(16) |
| :---: | :---: | :---: | :---: | :---: | :---: |
| Pipeline Inference | 41.78 | 94.18 | 172.67| 310.75| 470.15 |
| Hugging Face | 36.57 | 68.4 | 105.81 | 139.51 | 166.34 |
| batch_size(micro_batch size) | 2(1) | 4(2) | 8(4) | 16(8) | 32(16) |
|:----------------------------:|:-----:|:-----:|:------:|:------:|:------:|
| Pipeline Inference | 41.78 | 94.18 | 172.67 | 310.75 | 470.15 |
| Hugging Face | 36.57 | 68.4 | 105.81 | 139.51 | 166.34 |
@@ -12,7 +12,7 @@
GIGABYTE = 1024**3
MEGABYTE = 1024 * 1024

colossalai.launch_from_torch(config={})
colossalai.launch_from_torch()


def data_gen(batch_size: int = 4, seq_len: int = 512):
@@ -56,7 +56,7 @@ def setup(self, world_size, rank, port):
# initialize a ray collective group, otherwise colossalai distributed env won't be built successfully
collective.init_collective_group(world_size, rank, "nccl", "default")
# initialize and set distributed environment
colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
ray_serve_logger.info(f"Worker with rank {rank} (world size {world_size}) setting up..")
log_cuda_info("Worker.setup")

@@ -98,7 +98,7 @@ def initialize(self, ctx):
self.model.cuda()
self.model.eval()

colossalai.launch(config={}, rank=rank, world_size=world_size, host=host, port=port, backend="nccl")
colossalai.launch(rank=rank, world_size=world_size, host=host, port=port, backend="nccl")
logger.info("Initializing TPInferEngine ...")
shard_config = ShardConfig(
enable_tensor_parallelism=True if self.tp_size > 1 else False, extra_kwargs={"inference_only": True}
2 changes: 1 addition & 1 deletion colossalai/legacy/pipeline/rpc/utils.py
@@ -114,7 +114,7 @@ def run_worker(rank, args, master_func):
port = args.master_port
backend = "nccl" if device == "cuda" else "gloo"

launch(dict(), rank, world_size, host, int(port), backend, verbose=False)
launch(rank, world_size, host, int(port), backend, verbose=False)
ppg.set_global_info(
rank=rank,
world_size=world_size,