# gzip_difficulty.py

# -*- coding: utf-8 -*-
"""gzip-difficulty.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/[REDACTED]

# How well does compressibility predict the learnability of a dataset?

- compressibility: gzipability ~= length of gzipped string / length of original string
- learnability: learning difficulty ~= integral of perplexity across training steps
- datasets will be synthetically generated by PCFGs and taken from standard natural language & code datasets
  - hopefully the real-world datasets are in the PCFG's gzipability distribution


Training data preparation [reference](https://huggingface.co/learn/nlp-course/chapter3/4)

## Setup
"""

# ! pip install nltk pcfg
# ! pip install accelerate -U
# ! pip install transformers[torch] datasets wandb

# Commented out IPython magic to ensure Python compatibility.
# ! wandb login --relogin # [REDACTED]

# # %env WANDB_ENTITY=rspandey
# # %env WANDB_PROJECT=LM-Training

"""## Load Model"""

from transformers import LlamaForCausalLM, LlamaConfig

# Baseline Llama hyper-parameters. The size-related entries (hidden_size,
# intermediate_size, num_hidden_layers, num_attention_heads) are overridden per
# run by `model_sizes` in the training sweep further down in this file.
configuration = {
    # presumably the 32000 Llama-2 tokens plus the one added pad token (id 32000
    # is the pad id used by pad_and_mask) -- TODO confirm against the tokenizer
    "vocab_size": 32001,
    "hidden_size": 256,
    "intermediate_size": 512,
    "num_hidden_layers": 4,
    "num_attention_heads": 4,
    "max_position_embeddings": 256,
}
# Sequence length used elsewhere in this file as the default document length for
# generate_dataset and as the padding target in pcfg_dataset_to_dataloader.
context_length = configuration["max_position_embeddings"]

config = LlamaConfig(**configuration)
# Model built from the config only (no pretrained weights are loaded here).
model = LlamaForCausalLM(config)

# Report the total parameter count in millions.
print(f"Param Count: {sum(p.numel() for p in model.parameters()) / 1_000_000:.1f}M")

from transformers import AutoTokenizer

import os

# Never hard-code access tokens in source: read the Hugging Face token from the
# HF_TOKEN environment variable instead. When it is unset, `token=None` lets the
# library fall back to whatever credentials are already configured locally.
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf", token=os.environ.get("HF_TOKEN")
)  # TODO: replace with actual model name

# Register an explicit pad token and make sure the embedding matrix of the
# module-level `model` has a row for every tokenizer entry.
tokenizer.add_special_tokens({"pad_token": "<pad>"})
model.resize_token_embeddings(len(tokenizer))

"""## Real Data"""

from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from datasets import load_dataset, DatasetDict


def count_total_tokens(dataloader):
    """Return the sum of all ``attention_mask`` entries over every batch.

    Args:
        dataloader: Iterable of batches; each batch must map
            ``"attention_mask"`` to a tensor supporting ``flatten().tolist()``.

    Returns:
        The total of all mask values (0 for an empty iterable).
    """
    return sum(
        sum(batch["attention_mask"].flatten().tolist()) for batch in dataloader
    )


"""## CFG Data

https://www.nltk.org/api/nltk.grammar.PCFG.html

https://www.nltk.org/_modules/nltk/parse/generate.html
"""


def generate_probs(num_options):
    """Draw a random probability distribution over ``num_options`` outcomes.

    Each outcome gets an integer weight drawn uniformly from 1..100; the
    weights are then divided by their sum.

    Args:
        num_options: Number of outcomes; must be positive.

    Returns:
        A list of ``num_options`` floats that sum to 1 (up to float rounding).

    Raises:
        ValueError: If ``num_options`` is not positive.
    """
    if num_options <= 0:
        raise ValueError("Number of options must be positive")

    weights = [random.randint(1, 100) for _ in range(num_options)]
    weight_sum = sum(weights)
    return [weight / weight_sum for weight in weights]


import random
import math
from nltk import Nonterminal
from pcfg import PCFG


def _format_probability(prob):
    """Render a probability as plain decimal text for the grammar string.

    The default float formatting is kept whenever it is already plain decimal
    so existing grammars are unchanged; only values that would be rendered in
    scientific notation (e.g. ``1e-05``) are switched to fixed-point, because
    the bracketed-probability syntax is expected to contain digits and dots
    only.
    """
    text = f"{prob}"
    return f"{prob:.17f}" if "e" in text else text


def create_random_pcfg(
    num_nonterminals,
    num_terminals,
    rhs_max_options=5,
    rhs_max_len=5,
    constrain_to_pfsa=False,
):
    """Build a random probabilistic grammar and parse it with ``PCFG.fromstring``.

    Nonterminals are named ``N0..N{k-1}``; terminals are the quoted integers
    ``'0'..'{t-1}'``. Every nonterminal gets between 1 and ``rhs_max_options``
    alternatives with random probabilities (see ``generate_probs``), and a
    start rule ``S`` picks uniformly among all nonterminals.

    Args:
        num_nonterminals: Number of nonterminal symbols; must be >= 1.
        num_terminals: Number of terminal symbols.
        rhs_max_options: Maximum number of alternatives per nonterminal.
        rhs_max_len: Maximum number of symbols in one alternative.
        constrain_to_pfsa: When true, every alternative is exactly one symbol
            (terminal or nonterminal).
            TODO: is this the right constraint?

    Returns:
        The grammar object returned by ``PCFG.fromstring``.

    Raises:
        ValueError: If ``num_nonterminals`` is smaller than 1 (the start rule
            would otherwise be empty).
    """
    if num_nonterminals < 1:
        raise ValueError("Number of nonterminals must be positive")

    nonterminals = [f"N{i}" for i in range(num_nonterminals)]
    # Terminal symbols are consecutive integers, quoted for the grammar syntax.
    terminals = [f"'{i}'" for i in range(num_terminals)]
    # Built once: the pool every right-hand-side symbol is drawn from.
    symbol_pool = nonterminals + terminals

    productions = []
    for lhs in nonterminals:
        rhs_options_ct = random.randint(1, rhs_max_options)
        rhs_option_probs = generate_probs(rhs_options_ct)

        rhs_options = []
        for rhs_option_prob in rhs_option_probs:
            if constrain_to_pfsa:
                rhs_len = 1
            else:
                # Randomly decide the length of the right-hand side (at least 1)
                rhs_len = random.randint(1, rhs_max_len)
            rhs = [random.choice(symbol_pool) for _ in range(rhs_len)]
            rhs_options.append(
                f"{' '.join(rhs)} [{_format_probability(rhs_option_prob)}]"
            )

        productions.append(f"{lhs} -> {' | '.join(rhs_options)}")

    # The start rule goes first and chooses uniformly among the nonterminals.
    start_prob = _format_probability(1 / len(nonterminals))
    start_options = [f"{nonterminal} [{start_prob}]" for nonterminal in nonterminals]
    productions.insert(0, f"S -> {' | '.join(start_options)}")

    return PCFG.fromstring("\n".join(productions))


def generate_dataset(
    num_nonterminals,
    num_terminals,
    rhs_max_options,
    rhs_max_len,
    constrain_to_pfsa,
    num_toks_total,
    num_toks_per_seq=context_length,
):
    """Sample documents from a freshly created random grammar.

    Sentences drawn from the grammar are packed into documents, joined with
    the separator `` 0 ``. A sentence that fits is budgeted as its word count
    plus one (the extra token accounts for a separator); a sentence that does
    not fit is cut to the remaining number of words. Documents are produced
    until the budgeted token total reaches ``num_toks_total``.

    Note: because the separator is budgeted per sentence rather than per gap,
    a document whose last sentence fit exactly contains one token fewer than
    its budgeted count.

    Args:
        num_nonterminals, num_terminals, rhs_max_options, rhs_max_len,
        constrain_to_pfsa: Forwarded to ``create_random_pcfg``.
        num_toks_total: Target for the summed per-document token budget.
        num_toks_per_seq: Token budget of a single document.

    Returns:
        A list of document strings.
    """
    grammar = create_random_pcfg(
        num_nonterminals,
        num_terminals,
        rhs_max_options=rhs_max_options,
        rhs_max_len=rhs_max_len,
        constrain_to_pfsa=constrain_to_pfsa,
    )

    documents = []
    tokens_so_far = 0

    while tokens_so_far < num_toks_total:
        pieces = []
        used = 0

        # Fill one document; the loop condition ends it once the budget is hit.
        while used < num_toks_per_seq:
            try:
                sentence = next(grammar.generate(1))
            except RecursionError:
                # Derivation did not terminate within the recursion limit: resample.
                continue
            except StopIteration:
                break  # No more sentences can be generated

            remaining = num_toks_per_seq - used
            cost = sentence.count(" ") + 2

            if cost > remaining:
                # Keep only as many leading words as still fit in the document.
                kept_words = sentence.split()[:remaining]
                pieces.append(" ".join(kept_words))
                used += len(kept_words)
            else:
                pieces.append(sentence)
                used += cost

        if not pieces:
            break  # Nothing could be generated; avoid looping forever.

        documents.append(" 0 ".join(pieces))
        tokens_so_far += used

    return documents


# Grammar configurations, one per synthetic dataset. Each row is unpacked into
# the leading positional parameters of generate_dataset, i.e.
# (num_nonterminals, num_terminals, rhs_max_options, rhs_max_len, constrain_to_pfsa).
dataset_stats = [
    (5, 50, 3, 2, False),
    (10, 150, 5, 3, False),
    (20, 300, 10, 5, False),
    (50, 600, 30, 15, False),
    (100, 2000, 100, 30, False),
]
# One generated corpus per configuration; 1_000_000 is passed as num_toks_total.
# NOTE: this runs at import time and samples every corpus eagerly.
pcfg_datasets = [generate_dataset(*row, 1_000_000) for row in dataset_stats]

from datasets import Dataset


def pad_and_mask(sequence, sequence_length, pad_token_id=32000):
    """Truncate or right-pad a token sequence and build its attention mask.

    Args:
        sequence: Token ids (any sliceable sequence, e.g. list or tuple).
        sequence_length: Target length of the returned sequence.
        pad_token_id: Id appended as padding. Defaults to 32000, the pad id
            this file has always used.

    Returns:
        A ``(padded_sequence, mask)`` pair of lists. ``padded_sequence`` is
        always a new list (the input is never returned or modified), and
        ``mask`` holds 1 for every token different from ``pad_token_id`` and
        0 otherwise.
    """
    padded_sequence = list(sequence[:sequence_length])
    # A non-positive count yields an empty list, so no padding is added when
    # the sequence was truncated or already has the target length.
    padded_sequence += [pad_token_id] * (sequence_length - len(padded_sequence))
    mask = [0 if token == pad_token_id else 1 for token in padded_sequence]
    return padded_sequence, mask


def pcfg_dataset_to_dataloader(pcfg_dataset, batch_size=8, padder_tokenizer=tokenizer):
    """Turn whitespace-separated integer documents into a shuffled DataLoader.

    Every document is parsed into integer token ids, padded or truncated to
    ``context_length`` with ``pad_and_mask``, and stored together with its
    attention mask and language-modelling labels.

    Args:
        pcfg_dataset: Iterable of document strings made of integer tokens.
        batch_size: Number of documents per batch.
        padder_tokenizer: Tokenizer handed to ``DataCollatorWithPadding``.

    Returns:
        A ``DataLoader`` whose batches carry ``input_ids``, ``attention_mask``
        and ``labels``.
    """
    input_ids, attention_masks, labels = [], [], []
    for doc in pcfg_dataset:
        # split() (no argument) tolerates repeated or leading/trailing spaces,
        # which would otherwise produce empty strings that int() rejects.
        seq = [int(tok) for tok in doc.split()]
        padded_seq, mask = pad_and_mask(seq, context_length)
        input_ids.append(padded_seq)
        attention_masks.append(mask)
        # Masked-out (padding) positions get label -100 so that they are
        # ignored by the model's loss instead of being trained on.
        labels.append([tok if keep else -100 for tok, keep in zip(padded_seq, mask)])

    tokenized_dataset = Dataset.from_dict(
        {"input_ids": input_ids, "attention_mask": attention_masks, "labels": labels}
    )
    tokenized_dataset.set_format("torch")

    data_collator = DataCollatorWithPadding(tokenizer=padder_tokenizer)

    dataloader = DataLoader(
        tokenized_dataset, shuffle=True, batch_size=batch_size, collate_fn=data_collator
    )

    return dataloader


"""## gzip"""

import gzip
import io
from typing import List, Union


def calculate_gzipability(
    input_data: Union[str, List[int]], gzip_toks: bool = True
) -> float:
    """Return the gzip compression ratio (compressed size / original size).

    Args:
        input_data: Either a list of integer token ids or a string. A string
            is parsed as whitespace-separated integer tokens unless
            ``gzip_toks`` is false.
        gzip_toks: When true (default) the input is compressed as a token
            sequence, each token serialised as a 4-byte big-endian signed
            integer. When false and ``input_data`` is a string, its UTF-8
            bytes are compressed instead.

    Returns:
        ``len(gzip(bytes)) / len(bytes)`` as a float; lower means more
        compressible. The gzip container overhead is included, so very short
        inputs can exceed 1.

    Raises:
        ValueError: If the input is empty (the ratio is undefined), or if a
            string token cannot be parsed as an integer.
        OverflowError: If a token does not fit in 4 signed bytes.
    """
    if isinstance(input_data, str) and not gzip_toks:
        input_bytes = input_data.encode("utf-8")
    else:  # token list
        if isinstance(input_data, str):
            input_data = [int(tok) for tok in input_data.split()]
        input_bytes = b"".join(
            tok.to_bytes(4, byteorder="big", signed=True) for tok in input_data
        )

    if not input_bytes:
        raise ValueError("Cannot compute gzipability of empty input")

    buf = io.BytesIO()
    with gzip.GzipFile(fileobj=buf, mode="wb") as f:
        f.write(input_bytes)

    # Leaving the with-block finalises the gzip stream into buf, so the stream
    # position now equals the total compressed size.
    compressed_size = buf.tell()
    return compressed_size / len(input_bytes)


from statistics import median, stdev


def calculate_median_stdev_gzipability(pcfg_dataset):
    """Summarise per-document gzipability as ``(median, standard deviation)``.

    Args:
        pcfg_dataset: Iterable of documents, each a string of integer tokens
            separated by single spaces.

    Returns:
        A ``(median, stdev)`` tuple; the deviation is reported as 0 when there
        are fewer than two documents, since ``stdev`` needs at least two
        data points.
    """
    scores = []
    for row in pcfg_dataset:
        token_ids = [int(tok) for tok in row.split(" ")]
        scores.append(calculate_gzipability(token_ids))

    middle = median(scores)
    spread = stdev(scores) if len(scores) > 1 else 0
    return middle, spread


# Print one summary line per synthetic dataset: its index, median and standard
# deviation of per-document gzipability, the number of unmasked tokens after
# padding/truncation, and the grammar configuration it was generated from.
for i, pcfg_dataset in enumerate(pcfg_datasets):
    med, std = calculate_median_stdev_gzipability(pcfg_dataset)
    # Builds a throwaway dataloader only to count the attention-mask ones.
    total_toks = count_total_tokens(pcfg_dataset_to_dataloader(pcfg_dataset))

    print(
        f"{i}: {med:.3f} +- {std:.3f} ({total_toks})  | [{' '.join([str(x) for x in dataset_stats[i]])}]"
    )

"""## Training

Train on 2 synthetic datasets of similar token count but different gzipability medians; compare the sum of perplexities over N epochs.

TODO:
- ensure I don't have train data in the validation set (how many unique sentences is the grammar generating)
- model is unnecessarily large since vocab size is 32001
  - set padder_tokenizer for pcfg dataloader during each training run based on terminal_ct of pcfg_dataset
- pass name and run hyperparams to wandb

"""

import numpy as np
from torch.nn import CrossEntropyLoss


def compute_perplexity(dataloader, model, device=None):
    """Return the mean per-sequence perplexity of ``model`` over ``dataloader``.

    For every sequence, the next-token cross-entropy is averaged over the
    positions whose (shifted) attention mask is 1 and exponentiated; the
    result is the mean of those per-sequence perplexities.

    Args:
        dataloader: Iterable of batches that map ``"input_ids"`` and
            ``"attention_mask"`` to 2-D tensors.
        model: Causal language model whose call returns an object with a
            ``logits`` attribute.
        device: Device to evaluate on. ``None`` (default) selects CUDA when it
            is available and the CPU otherwise.

    Returns:
        The mean perplexity (``nan`` if ``dataloader`` yields no batches).

    Note:
        The model is moved to ``device``, switched to eval mode for the
        duration of the call, and restored to its previous train/eval mode
        afterwards.
    """
    # adapted from: https://github.com/huggingface/evaluate/blob/main/metrics/perplexity/perplexity.py
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    was_training = model.training
    model.eval()

    ppls = []
    loss_fct = CrossEntropyLoss(reduction="none")

    try:
        for batch in dataloader:
            # Move the tensors individually so plain dict batches work too.
            encoded_batch = batch["input_ids"].to(device)
            attn_mask = batch["attention_mask"].to(device)

            with torch.no_grad():
                out_logits = model(encoded_batch, attention_mask=attn_mask).logits

            # Position t predicts token t + 1: drop the last logit and the
            # first label/mask entry so the three tensors line up.
            shift_logits = out_logits[..., :-1, :].contiguous()
            shift_labels = encoded_batch[..., 1:].contiguous()
            shift_attention_mask_batch = attn_mask[..., 1:].contiguous()

            # CrossEntropyLoss expects the class dimension second.
            token_nll = loss_fct(shift_logits.transpose(1, 2), shift_labels)
            perplexity_batch = torch.exp(
                (token_nll * shift_attention_mask_batch).sum(1)
                / shift_attention_mask_batch.sum(1)
            )

            ppls += perplexity_batch.tolist()
    finally:
        model.train(was_training)

    return np.mean(ppls)


from tqdm.auto import tqdm


def run_training(model, train_dataloader, valid_dataloader, num_epochs=10):
    """Train ``model`` and record train/validation perplexity after each epoch.

    Args:
        model: Model to train; batches are moved to the device its parameters
            already live on.
        train_dataloader: Batches (dicts of tensors) passed to the model as
            keyword arguments; the model output must expose ``loss``.
        valid_dataloader: Batches used only for the validation perplexity.
        num_epochs: Number of passes over ``train_dataloader``.

    Returns:
        ``(train_perplexities, valid_perplexities)``: two lists with one entry
        per epoch.

    Note:
        The optimizer is NOT a parameter: this function steps the module-level
        ``optimizer``, which the caller must have created for ``model`` before
        calling.
    """
    # Use the model's own device so batches can never end up elsewhere.
    model_device = next(model.parameters()).device

    train_perplexities = []
    valid_perplexities = []

    for epoch in range(num_epochs):
        model.train()
        # The context manager closes the bar at the end of every epoch.
        with tqdm(
            total=len(train_dataloader), desc=f"Epoch {epoch + 1}/{num_epochs}"
        ) as progress_bar:
            for batch in train_dataloader:
                batch = {k: v.to(model_device) for k, v in batch.items()}
                outputs = model(**batch)
                loss = outputs.loss
                loss.backward()

                optimizer.step()
                optimizer.zero_grad()
                progress_bar.update(1)

        # Switch to eval mode BEFORE measuring either perplexity, and evaluate
        # on the training device rather than compute_perplexity's default.
        model.eval()
        with torch.no_grad():
            train_perplexity = compute_perplexity(
                train_dataloader, model, device=model_device
            )
            valid_perplexity = compute_perplexity(
                valid_dataloader, model, device=model_device
            )
        train_perplexities.append(train_perplexity)
        valid_perplexities.append(valid_perplexity)

        print(
            f"Epoch {epoch}: Training Perplexity: {train_perplexity}, Validation Perplexity: {valid_perplexity}"
        )

    return train_perplexities, valid_perplexities


import torch

# (median, stdev) gzipability per synthetic dataset, index-aligned with
# pcfg_datasets and dataset_stats.
med_std_gzips = [
    calculate_median_stdev_gzipability(pcfg_dataset) for pcfg_dataset in pcfg_datasets
]

# Model-size sweep. The lists are index-aligned: entry j of every list together
# describes one model variant, which overrides the matching keys of the
# baseline `configuration` in the training sweep below.
model_sizes = {
    "hidden_size": [64, 128, 256, 512, 1024],
    "intermediate_size": [128, 256, 512, 1024, 2048],
    "num_hidden_layers": [2, 4, 6, 10, 20],
    "num_attention_heads": [1, 2, 4, 8, 16],
}

# Training device: CUDA when available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

from transformers import AdamW
import json

# Sweep: data fraction x synthetic dataset x model size. Every run trains a
# fresh model and appends one JSON line with its statistics to results.jsonl.
results = []

for data_portion in (0.01, 0.1, 0.2, 0.5, 0.95):
    for i, pcfg_dataset in enumerate(pcfg_datasets):
        med_gzip, std_gzip = med_std_gzips[i]

        train_data_size = int(len(pcfg_dataset) * data_portion)
        # Validation documents come from the tail of the dataset. The size is
        # kept >= 1 because a size of 0 would make `pcfg_dataset[-0:]` select
        # the WHOLE dataset, and it is capped by the number of documents not
        # used for training so the two slices do not overlap where possible.
        held_out_size = len(pcfg_dataset) - train_data_size
        valid_data_size = max(1, min(100, train_data_size // 10, held_out_size))
        train_dataloader = pcfg_dataset_to_dataloader(pcfg_dataset[:train_data_size])
        valid_dataloader = pcfg_dataset_to_dataloader(pcfg_dataset[-valid_data_size:])
        train_token_ct = count_total_tokens(train_dataloader)

        # zip(*values) walks the index-aligned size lists one variant at a time.
        for size_values in zip(*model_sizes.values()):
            print("-" * 20)

            model_stats = dict(zip(model_sizes, size_values))
            model_config_dict = {
                **configuration,
                **model_stats,
            }  # NOTE: update vocab_size and new tokenizer?
            model_config = LlamaConfig(**model_config_dict)
            model = LlamaForCausalLM(model_config)
            model_size = sum(p.numel() for p in model.parameters())

            print(
                f"Dataset Stats: {med_gzip:.3f} +- {std_gzip:.3f} | {dataset_stats[i]}"
            )
            print(f"Model Size: {model_size/1_000_000:.1f}M")
            print(f"Train Token Count: {train_token_ct}")

            model.to(device)
            # run_training reads this module-level `optimizer`; it must be
            # re-created for every new model before training starts.
            optimizer = AdamW(model.parameters(), lr=5e-5)
            num_epochs = 10

            train_perplexities, valid_perplexities = run_training(
                model, train_dataloader, valid_dataloader, num_epochs=num_epochs
            )

            row = {
                "dataset_stats": dataset_stats[i],
                "dataset_gzip": (med_gzip, std_gzip),
                "token_ct": train_token_ct,
                "model_stats": model_config_dict,
                "model_size": model_size,
                "num_epochs": num_epochs,
                "train_pplx": train_perplexities,
                "valid_pplx": valid_perplexities,
            }
            results.append(row)

            # Append immediately so finished runs survive a later crash.
            with open("results.jsonl", "a") as file:
                file.write(json.dumps(row) + "\n")