diff --git a/llm_cfg/convert_llama_weights_to_hf.py b/llm_cfg/convert_llama_weights_to_hf.py
new file mode 100644
index 00000000..4a3ee06a
--- /dev/null
+++ b/llm_cfg/convert_llama_weights_to_hf.py
@@ -0,0 +1,278 @@
+# Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import gc
+import json
+import math
+import os
+import shutil
+import warnings
+
+import torch
+
+from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
+
+
+try:
+    from transformers import LlamaTokenizerFast
+except ImportError as e:
+    warnings.warn(e)
+    warnings.warn(
+        "The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion"
+    )
+    LlamaTokenizerFast = None
+
+"""
+Sample usage:
+
+```
+python src/transformers/models/llama/convert_llama_weights_to_hf.py \
+    --input_dir /share/models/llama_model/llama/ --model_size 13B --output_dir /share/models/llama_model/hf/13B/
+```
+
+Thereafter, models can be loaded via:
+
+```py
+from transformers import LlamaForCausalLM, LlamaTokenizer
+
+model = LlamaForCausalLM.from_pretrained("/share/models/llama_model/hf/13B/")
+tokenizer = LlamaTokenizer.from_pretrained("/share/models/llama_model/hf/13B/")
+```
+
+Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions
+come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM).
+"""
+
+INTERMEDIATE_SIZE_MAP = {
+    "7B": 11008,
+    "13B": 13824,
+    "30B": 17920,
+    "65B": 22016,
+}
+NUM_SHARDS = {
+    "7B": 1,
+    "13B": 2,
+    "30B": 4,
+    "65B": 8,
+}
+
+
+def compute_intermediate_size(n):
+    return int(math.ceil(n * 8 / 3) + 255) // 256 * 256
+
+
+def read_json(path):
+    with open(path, "r") as f:
+        return json.load(f)
+
+
+def write_json(text, path):
+    with open(path, "w") as f:
+        json.dump(text, f)
+
+
+def write_model(model_path, input_base_path, model_size):
+    os.makedirs(model_path, exist_ok=True)
+    tmp_model_path = os.path.join(model_path, "tmp")
+    os.makedirs(tmp_model_path, exist_ok=True)
+
+    params = read_json(os.path.join(input_base_path, "params.json"))
+    num_shards = NUM_SHARDS[model_size]
+    n_layers = params["n_layers"]
+    n_heads = params["n_heads"]
+    n_heads_per_shard = n_heads // num_shards
+    dim = params["dim"]
+    dims_per_head = dim // n_heads
+    base = 10000.0
+    inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
+
+    # permute for sliced rotary
+    def permute(w):
+        return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
+
+    print(f"Fetching all parameters from the checkpoint at {input_base_path}.")
+    # Load weights
+    if model_size == "7B":
+        # Not sharded
+        # (The sharded implementation would also work, but this is simpler.)
+        loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu")
+    else:
+        # Sharded
+        loaded = [
+            torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu")
+            for i in range(num_shards)
+        ]
+    param_count = 0
+    index_dict = {"weight_map": {}}
+    for layer_i in range(n_layers):
+        filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin"
+        if model_size == "7B":
+            # Unsharded
+            state_dict = {
+                f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
+                    loaded[f"layers.{layer_i}.attention.wq.weight"]
+                ),
+                f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
+                    loaded[f"layers.{layer_i}.attention.wk.weight"]
+                ),
+                f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"],
+                f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"],
+                f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"],
+                f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"],
+                f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"],
+                f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"layers.{layer_i}.attention_norm.weight"],
+                f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"],
+            }
+        else:
+            # Sharded
+            # Note that in the 13B checkpoint, not cloning the two following weights will result in the checkpoint
+            # becoming 37GB instead of 26GB for some reason.
+            state_dict = {
+                f"model.layers.{layer_i}.input_layernorm.weight": loaded[0][
+                    f"layers.{layer_i}.attention_norm.weight"
+                ].clone(),
+                f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[0][
+                    f"layers.{layer_i}.ffn_norm.weight"
+                ].clone(),
+            }
+            state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute(
+                torch.cat(
+                    [
+                        loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim)
+                        for i in range(num_shards)
+                    ],
+                    dim=0,
+                ).reshape(dim, dim)
+            )
+            state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute(
+                torch.cat(
+                    [
+                        loaded[i][f"layers.{layer_i}.attention.wk.weight"].view(n_heads_per_shard, dims_per_head, dim)
+                        for i in range(num_shards)
+                    ],
+                    dim=0,
+                ).reshape(dim, dim)
+            )
+            state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat(
+                [
+                    loaded[i][f"layers.{layer_i}.attention.wv.weight"].view(n_heads_per_shard, dims_per_head, dim)
+                    for i in range(num_shards)
+                ],
+                dim=0,
+            ).reshape(dim, dim)
+
+            state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat(
+                [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1
+            )
+            state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat(
+                [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0
+            )
+            state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat(
+                [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1
+            )
+            state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat(
+                [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0
+            )
+
+        state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
+        for k, v in state_dict.items():
+            index_dict["weight_map"][k] = filename
+            param_count += v.numel()
+        torch.save(state_dict, os.path.join(tmp_model_path, filename))
+
filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin" + if model_size == "7B": + # Unsharded + state_dict = { + "model.embed_tokens.weight": loaded["tok_embeddings.weight"], + "model.norm.weight": loaded["norm.weight"], + "lm_head.weight": loaded["output.weight"], + } + else: + state_dict = { + "model.norm.weight": loaded[0]["norm.weight"], + "model.embed_tokens.weight": torch.cat( + [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1 + ), + "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0), + } + + for k, v in state_dict.items(): + index_dict["weight_map"][k] = filename + param_count += v.numel() + torch.save(state_dict, os.path.join(tmp_model_path, filename)) + + # Write configs + index_dict["metadata"] = {"total_size": param_count * 2} + write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) + + config = LlamaConfig( + hidden_size=dim, + intermediate_size=compute_intermediate_size(dim), + num_attention_heads=params["n_heads"], + num_hidden_layers=params["n_layers"], + rms_norm_eps=params["norm_eps"], + ) + config.save_pretrained(tmp_model_path) + + # Make space so we can load the model properly now. + del state_dict + del loaded + gc.collect() + + print("Loading the checkpoint in a Llama model.") + model = LlamaForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) + # Avoid saving this as part of the config. + del model.config._name_or_path + + print("Saving in the Transformers format.") + model.save_pretrained(model_path) + shutil.rmtree(tmp_model_path) + + +def write_tokenizer(tokenizer_path, input_tokenizer_path): + # Initialize the tokenizer based on the `spm` model + tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast + print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") + tokenizer = tokenizer_class(input_tokenizer_path) + tokenizer.save_pretrained(tokenizer_path) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--input_dir", + help="Location of LLaMA weights, which contains tokenizer.model and model folders", + ) + parser.add_argument( + "--model_size", + choices=["7B", "13B", "30B", "65B", "tokenizer_only"], + ) + parser.add_argument( + "--output_dir", + help="Location to write HF model and tokenizer", + ) + args = parser.parse_args() + if args.model_size != "tokenizer_only": + write_model( + model_path=args.output_dir, + input_base_path=os.path.join(args.input_dir, args.model_size), + model_size=args.model_size, + ) + spm_path = os.path.join(args.input_dir, "tokenizer.model") + write_tokenizer(args.output_dir, spm_path) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/llm_cfg/core/evaluation.py b/llm_cfg/core/evaluation.py index fa43cf49..5f357330 100644 --- a/llm_cfg/core/evaluation.py +++ b/llm_cfg/core/evaluation.py @@ -1,5 +1,5 @@ import time -from human_eval.data import write_jsonl, read_problems +from mxeval.data import write_jsonl, read_problems, get_data from transformers import ( PreTrainedModel, PreTrainedTokenizer, @@ -31,13 +31,13 @@ def split_batch(samples: list[str], size=4): return mini_batches -def run_eval( +def run_eval(args, hf_model, num_samples_per_task: int, out_path: str, format_tabs: bool = False, ): - problems = read_problems() + problems = get_data(args.dataset, args.language) # problems = dict(itertools.islice(problems.items(), 20)) samples = [] pbar = tqdm(total=len(problems) * 
     pbar = tqdm(total=len(problems) * num_samples_per_task)
@@ -54,7 +54,8 @@ def run_eval(
         for sample in batch_completions:
             result = dict(
                 task_id=task_id,
-                completion=sample,
+                language=problems[task_id]["language"],
+                completion=sample
             )
 
             samples += [result]
diff --git a/llm_cfg/evaluation.py b/llm_cfg/evaluation.py
index fd13f195..9276dfad 100644
--- a/llm_cfg/evaluation.py
+++ b/llm_cfg/evaluation.py
@@ -5,60 +5,50 @@
 import traceback
 from typing import List, Union, Iterable, Dict
 import itertools
-
+import os
 import numpy as np
+import time
 import tqdm
 from typing import Optional, Callable, Dict
 import multiprocessing
+import subprocess
+import re
 
-from human_eval.data import HUMAN_EVAL, read_problems, stream_jsonl, write_jsonl
-from human_eval.execution import swallow_io, time_limit, TimeoutException, create_tempdir, reliability_guard
-
-
-def estimate_pass_at_k(
-    num_samples: Union[int, List[int], np.ndarray],
-    num_correct: Union[List[int], np.ndarray],
-    k: int
-) -> np.ndarray:
-    """
-    Estimates pass@k of each problem and returns them in an array.
-    """
-
-    def estimator(n: int, c: int, k: int) -> float:
-        """
-        Calculates 1 - comb(n - c, k) / comb(n, k).
-        """
-        if n - c < k:
-            return 1.0
-        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
-
-    if isinstance(num_samples, int):
-        num_samples_it = itertools.repeat(num_samples, len(num_correct))
-    else:
-        assert len(num_samples) == len(num_correct)
-        num_samples_it = iter(num_samples)
-
-    return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
+from mxeval.data import read_problems, stream_jsonl, write_jsonl, get_metadata
+from mxeval.execution import swallow_io, time_limit, TimeoutException, create_tempdir, reliability_guard, setup_base_path
+from mxeval.evaluation import estimate_pass_at_k, get_execute_function
 
 
 def evaluate_functional_correctness(
     sample_file: str,
     k: List[int] = [1, 10, 100],
-    n_workers: int = 4,
-    timeout: float = 3.0,
-    problem_file: str = HUMAN_EVAL,
+    n_workers: int = os.cpu_count() - 1,
+    timeout: float = 10.0,
+    problem_file: str = '',
 ):
     """
     Evaluates the functional correctness of generated samples, and writes
-    results to f"{sample_file}_results.jsonl.gz"
+    results to f"{sample_file}_results.jsonl"
     """
-    problems = read_problems(problem_file)
+    if type(problem_file) is not dict:
+        problems = read_problems(problem_file)
+    else:
+        print("Skip reading problems -- using problem_file (dict) as problems")
+        problems = problem_file
+    error_types = {}
 
+    # see execution.py for details
     # Check the generated samples against test suites.
-    with ThreadPoolExecutor(max_workers=n_workers) as executor:
+    check_correctness_function_map = {
+        "python": check_correctness,
+        "go": check_correctness_go
+    }
+    seed = int(time.time() * 1000000) % 1000000
+    np.random.seed(seed=seed)  # microsecond
+    with ThreadPoolExecutor(max_workers=n_workers) as executor:
 
         futures = []
         completion_id = Counter()
         n_samples = 0
@@ -69,20 +59,19 @@ def evaluate_functional_correctness(
             task_id = sample["task_id"]
             completion = sample["completion"]
             args = (problems[task_id], completion, timeout, completion_id[task_id])
-            future = executor.submit(check_correctness, *args)
+            language = sample["language"]
+            check_correctness_function = check_correctness_function_map[language]
+            future = executor.submit(check_correctness_function, *args)
             futures.append(future)
             completion_id[task_id] += 1
             n_samples += 1
 
-        assert len(completion_id) == len(problems), "Some problems are not attempted."
-
print("Running test suites...") i = 0 for future in tqdm.tqdm(as_completed(futures), total=len(futures)): - print(80*'-') - print('Running sample: ', i) - result = future.result() + result = future.result() # this is the execution stage results[result["task_id"]].append((result["completion_id"], result)) i += 1 @@ -92,6 +81,7 @@ def evaluate_functional_correctness( print(error_types) + # common code for all languages # Calculate pass@k. total, correct = [], [] for result in results.values(): @@ -103,8 +93,11 @@ def evaluate_functional_correctness( correct = np.array(correct) ks = k - pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean() - for k in ks if (total >= k).all()} + pass_at_k = { + f"pass@{k}": estimate_pass_at_k(total, correct, k).mean() + for k in ks + if (total >= k).all() + } # Finally, save the results in one file: def combine_results(): @@ -113,6 +106,7 @@ def combine_results(): result = results[task_id].pop(0) sample["result"] = result[1]["result"] sample["passed"] = result[1]["passed"] + sample["time_elapsed"] = result[1]["time_elapsed"] sample["error_type"] = result[1]["error_type"] yield sample @@ -122,12 +116,14 @@ def combine_results(): return pass_at_k -def check_correctness(problem: Dict, completion: str, timeout: float, - completion_id: Optional[int] = None) -> Dict: + + +def check_correctness( + problem: Dict, completion: str, timeout: float, completion_id: Optional[int] = None +) -> Dict: """ Evaluates the functional correctness of a completion by running the test - suite provided in the problem. - + suite provided in the problem. :param completion_id: an optional completion ID so we can match the results later even if execution finishes asynchronously. """ @@ -139,6 +135,7 @@ def unsafe_execute(): # These system calls are needed when cleaning up tempdir. import os import shutil + rmtree = shutil.rmtree rmdir = os.rmdir chdir = os.chdir @@ -148,9 +145,12 @@ def unsafe_execute(): # Construct the check program and run it. 
             check_program = (
-                problem["prompt"] + completion + "\n" +
-                problem["test"] + "\n" +
-                f"check({problem['entry_point']})"
+                problem["prompt"]
+                + completion
+                + "\n"
+                + problem["test"]
+                + "\n"
+                + f"check({problem['entry_point']})"
             )
 
             try:
@@ -191,11 +191,13 @@ def unsafe_execute():
 
     result = manager.list()
     error_types = manager.list()
+    start = time.time()
    p = multiprocessing.Process(target=unsafe_execute)
     p.start()
     p.join(timeout=timeout + 1)
     if p.is_alive():
         p.kill()
+    elapsed = 1000.0 * (time.time() - start)
 
     if not result:
         result.append("timed out")
@@ -208,17 +210,144 @@ def unsafe_execute():
         passed=result[0] == "passed",
         result=result[0],
         completion_id=completion_id,
+        time_elapsed=elapsed,
         error_type = error_types[0]
     )
 
 
+def check_correctness_helper(
+    problem: Dict,
+    completion: str,
+    timeout: float,
+    completion_id: Optional[int] = None,
+    verbose=False,
+    language=None,
+    extension=None,
+    subprocess_command_lambda=None,
+    compile_timeout=100,
+    compile_command_lambda=None,
+    extra_cleanup=None,
+    cwd=None,
+):
+    current_dir = os.path.dirname(os.path.realpath(__file__))
+    entire_string = problem["prompt"] + completion + problem["test"]
+
+    language_dirname = f"{language}_exec_eval"
+
+    base_path = setup_base_path(current_dir, language_dirname, extension)
+    path = base_path + f"{extension}"
+
+    if cwd is not None:
+        cwd = os.path.dirname(base_path)
+    with open(path, "w") as f:
+        f.write(entire_string)
+    try:
+        if compile_command_lambda is not None:
+            compile_result = subprocess.run(
+                compile_command_lambda(base_path),
+                timeout=int(compile_timeout),
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                cwd=cwd,
+            )
+            compiled = compile_result.returncode == 2 if language == "typescript" else compile_result.returncode == 0
+        else:
+            compiled = True
+
+        if compiled:
+            start = time.time()
+            exec_result_run = subprocess.run(
+                subprocess_command_lambda(base_path),
+                timeout=int(timeout),
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                cwd=cwd,
+            )
+            elapsed = 1000.0 * (time.time() - start)
+            if verbose:
+                print("exec result run", exec_result_run)
+
+            passed = exec_result_run.returncode == 0
+            message = exec_result_run.stderr
+            error_type = 'No Error'
+        else:
+            passed, message, error_type, elapsed = False, compile_result.stderr, compile_result.stderr, None
+
+    except Exception as e:
+        if verbose:
+            print(f"error occurred when running test cases: {e}")
+        message = str(e)
+        error_type = type(e).__name__
+        passed, elapsed, compiled = False, None, False
+
+    # clean up
+    try:
+        os.remove(path)
+    except Exception as e:
+        if verbose:
+            print(f"Error trying to clean up file: {e}")
+    try:
+        if extra_cleanup is not None:
+            extra_remove_path = extra_cleanup(base_path)
+            assert isinstance(extra_remove_path, str)
+            os.remove(extra_remove_path)
+    except Exception as e:
+        if verbose:
+            print(f"Error trying to clean up file: {e}")
+
+    # get result
+    return dict(
+        task_id=problem["task_id"],
+        passed=passed,
+        result=message,
+        completion_id=completion_id,
+        time_elapsed=elapsed,
+        compiled=compiled,
+        error_type = error_type
+    )
+
+
+def check_correctness_go(
+    problem: Dict,
+    completion: str,
+    timeout: float,
+    completion_id: Optional[int] = None,
+    verbose=False,
+):
+    return check_correctness_helper(
+        problem=problem,
+        completion=completion,
+        timeout=timeout,
+        completion_id=completion_id,
+        verbose=verbose,
+        language="go",
+        extension=".go",
+        subprocess_command_lambda=lambda x: ["go", "run", f"{x}.go"],
+    )
+
+
language="python"): + metadata, datadir = get_metadata(dataset, metadata_type="problem") + if language.lower() not in metadata: + raise ValueError(f"Language {language} not found in metadata file") + datafile = metadata[language.lower()] + return os.path.join(datadir, datafile) + # Take the filename as input def main(): parser = argparse.ArgumentParser() - parser.add_argument("filename", help="Enter the filename") + parser.add_argument("--filename", help="Enter the filename") args = parser.parse_args() filename = args.filename - - results = evaluate_functional_correctness(filename) + language = "go" + dataset = "mbxp" + if re.search(r"python", filename,): + language = "python" + if re.search( r"humaneval", filename): + dataset = "multi-humaneval" + elif re.search(r"mathqa", filename): + dataset = "mathqa-x" + + results = evaluate_functional_correctness(filename, problem_file= get_datafile(dataset, language)) print(results) if __name__ == "__main__": diff --git a/llm_cfg/infer.py b/llm_cfg/infer.py index 6e61bbf2..6cf242a7 100644 --- a/llm_cfg/infer.py +++ b/llm_cfg/infer.py @@ -25,10 +25,12 @@ p.add_argument("--quantize", type=bool, default=True) p.add_argument("--gpu", type=int, default=1) p.add_argument("--num_samples", type=int, default=1) + p.add_argument("--language", choices = ["python", "go"], default = "python", help = "language") + p.add_argument("--dataset", choices = ["mbxp", "humaneval", "mathqa"], default = "humaneval", help = "dataset") args = p.parse_args() num_samples_per_task = args.num_samples - out_dir = "results/llama/" + out_dir = f"results/llama/{args.language}/{args.dataset}/" out_path = out_dir + 'model_size' + str(args.model_size) + '_samples_' + str(num_samples_per_task) + '_mode_' + str(args.mode) + "_eval.jsonl" os.makedirs(out_dir, exist_ok=True) @@ -45,7 +47,7 @@ hf_model = HuggingFaceModel(model, tokenizer=tokenizer, device=device, logit_processors=logit_processors, mode=args.mode) - run_eval( + run_eval(args, hf_model, num_samples_per_task, out_path, diff --git a/requirements.txt b/requirements.txt index 8a01ea62..29cbbf68 100644 --- a/requirements.txt +++ b/requirements.txt @@ -41,3 +41,4 @@ transformers==4.28.1 triton==2.0.0 typing_extensions==4.7.1 urllib3==2.0.4 +mxeval \ No newline at end of file