diff --git a/notebooks/eval_json.ipynb b/notebooks/eval_json.ipynb
new file mode 100644
index 00000000..16019d8c
--- /dev/null
+++ b/notebooks/eval_json.ipynb
@@ -0,0 +1,86 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/shubham/anaconda3/envs/codex/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n",
+      "Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 11.83it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Running Gemma-2-2b-it on JSON evaluation from NousResearch/json-mode-eval\n",
+    "import sys, os\n",
+    "sys.path.append(os.getcwd() + '/../')\n",
+    "from syncode import Syncode\n",
+    "\n",
+    "syn_llm = Syncode(\n",
+    "    mode=\"grammar_mask\",\n",
+    "    model=\"google/gemma-2-2b-it\",\n",
+    "    grammar=\"json\",\n",
+    "    max_new_tokens=400,\n",
+    "    parser=\"lr\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 100/100 [07:50<00:00,  4.71s/it]"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Result: {'pass@1': 0.99}\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "result = syn_llm.evaluate(dataset=\"json_eval\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "codex",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
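Before running the full benchmark, the same setup can be sanity-checked on a single prompt. A minimal sketch, assuming the `infer` entry point documented in the project README; the prompt text and the shape of the return value are illustrative assumptions, not part of this PR:

```python
# Hypothetical single-prompt use of the setup from the notebook above.
import json
from syncode import Syncode

syn_llm = Syncode(
    mode="grammar_mask",           # constrain decoding with the JSON grammar mask
    model="google/gemma-2-2b-it",
    grammar="json",
    max_new_tokens=400,
    parser="lr",
)

output = syn_llm.infer('Give me a JSON object with keys "name" and "age".')
completion = output[0] if isinstance(output, list) else output
json.loads(completion)  # grammar-masked decoding should yield parseable JSON
```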
+ """ + if logger.is_closed: + logger.open() + + # Load the dataset + self.dataset = Dataset(dataset, language=self.grammar, num_few_shot=num_few_shot) if self.dataset.type == "code": - output = CodeEval.run_code_eval(self, self.num_samples, self.out_path, format_tabs=True, debug_task_id=task_id) + output = CodeEval.run_code_eval(self, self.num_samples, out_path, format_tabs=True, debug_task_id=task_id, logger=logger) elif self.dataset.type == "math": - output = MathEval.run_math_eval(self, debug_task_id=task_id) + output = MathEval.run_math_eval(self, out_path, debug_task_id=task_id, logger=logger) elif self.dataset.type == "sql": - output = SQLEval.run_eval(self) + output = SQLEval.run_eval(self, out_path) elif self.dataset.type == "fol": - output = FOLEval.run_eval(self, debug_task_id=task_id) - elif self.dataset.type == "input": - output = self.user_input(prompt, stop_words=stop_words) + output = FOLEval.run_eval(self, out_path, debug_task_id=task_id) elif self.dataset.type == "json": - output = JSONEval.run_json_eval(self, debug_task_id=task_id) + output = JSONEval.run_json_eval(self, out_path, debug_task_id=task_id, logger=logger) else: raise ValueError(f"Dataset type {self.dataset.type} not supported") - self.logger.close() + logger.close() return output - def get_output_path(self): - out_dir = f"results/{self.model_name}/{self.grammar}/{self.dataset}/" - out_path = out_dir + 'samples_' + str(self.num_samples) + '_mode_' + str(self.mode) + "_eval.jsonl" - os.makedirs(out_dir, exist_ok=True) - return out_dir,out_path - def user_input(self, prompt:str, stop_words=[]): """ Run user input on the model with grammar mask @@ -175,7 +187,6 @@ def user_input(self, prompt:str, stop_words=[]): return self.model.generate_chat_completion_grammar(prompt) else: return self.model.generate_batch_completion_grammar(prompt, self.num_samples, stop_words=stop_words) - else: while True: prompt = input('Enter prompt: ') diff --git a/syncode/language_model.py b/syncode/language_model.py index 476ff943..05d80271 100644 --- a/syncode/language_model.py +++ b/syncode/language_model.py @@ -35,7 +35,6 @@ def __init__( self, model: Callable, grammar: Grammar, - logger: common.Logger, tokenizer=None, prompt_template: str = '', best_of: int = 1, @@ -48,7 +47,6 @@ def __init__( self.prompt_template = prompt_template self.model = model - self.logger = logger self.tokenizer = tokenizer self.device = device self.best_of = best_of @@ -114,24 +112,19 @@ def generate_batch_completion_grammar(self, prompt, batch_size, stop_words=None) # TODO: Move this to CodeEval for i in range(batch_size): raw_completion = self.tokenizer.decode(generated_ids[i][input_ids_cutoff:len(generated_ids[i])], skip_special_tokens=True) - self.logger.log_code("Raw completion", raw_completion) # Post-processing to filter out using stop word (e.g. 
"\n\n") if self.grammar != None and self.grammar.name == "python": completion = self.postproces_completion_python(i, batch_size, input_ids_cutoff, generated_ids, self.grammar_decoder, raw_completion) - self.logger.log_code("Filtered sample", completion) elif self.grammar != None and self.grammar.name == "go": completion = self.postproces_completion_go(i, batch_size, raw_completion, generated_ids, self.grammar_decoder, input_ids_cutoff) - self.logger.log_code("Filtered sample", completion) else: # TODO: handle the case for other grammars completion = raw_completion batch_completions.append(completion) - self.logger.log(f"Completion: {batch_completions}") return batch_completions - @torch.inference_mode() def _generate( self, @@ -260,7 +253,6 @@ def generate_chat_completion_grammar(self, prompt) -> str: generated_ids[0][input_ids_cutoff:len(generated_ids[0])], skip_special_tokens=True) - self.logger.log_code("Raw completion", completion) return completion @@ -283,7 +275,6 @@ def postproces_completion_go(self, i, batch_size, raw_completion, generated_ids, else: # When the grammar_decoder is used function_incomplete = [False for _ in range(batch_size)] - self.logger.log(f"Function incomplete!") completion = self.compute_backup_completion(grammar_decoder, function_incomplete, i, input_ids_cutoff, generated_ids) if function_incomplete[i]: @@ -293,9 +284,6 @@ def postproces_completion_go(self, i, batch_size, raw_completion, generated_ids, return completion def compute_backup_completion(self, grammar_decoder, function_incomplete, i, input_ids_cutoff, generated_ids): - self.logger.log(f"Last valid state: {grammar_decoder.last_valid_state[i]}") - self.logger.log(f"Function end: {grammar_decoder.function_end[i]}") - if grammar_decoder.function_end[i] is not None: # if the function end is not None, then the last valid state is the function end last_token_id = grammar_decoder.function_end[i]