In [1]:
import math
import random
import pickle
import requests
In [2]:
import numpy as np
In [3]:
import jax.numpy as jnp
import jax

GPT From Scratch¶

This is a continuation of this blog by Jay Mody. In it, he bootstraps the forward pass of a GPT (using "only numpy"), then loads in the weights for GPT2 and tries it out. It's a very good blog, but I have two major gripes with it:

  • I think it's longer than it should be, and a bit convoluted.
  • He didn't include any functional training code, which is like giving someone a toy without batteries. In an attempt to fix this injustice, I'll try my hand at shortening his explanations and adding some training code.

Generating Parameters¶

You can load the original parameters in from GPT2 and play around with them. To avoid forcing the user to download tensorflow to convert the weights, I have included them as an (insecure) pickle file. If you're running my notebook anyway it's not much of a security difference, but you should still be wary of these things.

In [192]:
# Load in GPT parameters, we won't be training this guy though
with open("gpt.pickle", 'rb') as f:
    encoder, hparams, params = pickle.load(f)
In [5]:
# Show what those look like
def show_params(head, indent_level=0):
    if isinstance(head, dict):
        # Print each key, then recurse into its value
        for i in head:
            print("  "*indent_level, end='')
            print(i)
            show_params(head[i], indent_level=indent_level+1)
    elif isinstance(head, list):
        # Lists here are the transformer blocks; they all have the same structure,
        # so just show one of them (index 11, which assumes the 12-block GPT2 layout)
        i = head[11]
        print("  "*indent_level, end='')
        show_params(i, indent_level=indent_level+1)
    else:
        # The leaves are arrays, so print their shapes
        print("  "*indent_level, end='')
        print(head.shape)
In [6]:
# Or we could make our own
def generate_random_parameters(embedding_length, vocab_size, context_size, num_blocks, proj_width=4):
    # What sort of random are we feeling today?
    initializer = np.random.normal

    # These ones are nice and simple aren't they?
    wpe = initializer(size=[context_size, embedding_length])
    wte = initializer(size=[vocab_size, embedding_length])

    # A little trickier
    ln_f = {'b': initializer(size=[embedding_length]), 'g': initializer(size=[embedding_length])}

    # Okay here we go
    blocks = []
    for _ in range(num_blocks):
        # Build the multi-head attention
        c_attn = {'b': initializer(size=[3*embedding_length]), 'w': initializer(size=[embedding_length, 3*embedding_length])}
        c_proj = {'b': initializer(size=[embedding_length]), 'w': initializer(size=[embedding_length, embedding_length])}
        attn = {'c_attn': c_attn, 'c_proj': c_proj}

        # The layer norms that sit before the attention and before the MLP
        ln_1 = {'b': initializer(size=[embedding_length]), 'g': initializer(size=[embedding_length])}
        ln_2 = {'b': initializer(size=[embedding_length]), 'g': initializer(size=[embedding_length])}

        # Build a multilayer perceptron
        mlp_c_fc = {'b': initializer(size=[proj_width*embedding_length]), 'w': initializer(size=[embedding_length, proj_width*embedding_length])}
        mlp_c_proj = {'b': initializer(size=[embedding_length]), 'w': initializer(size=[proj_width*embedding_length, embedding_length]) }
        
        mlp = {'c_fc': mlp_c_fc, 'c_proj': mlp_c_proj }

        # Finally build the block
        block = {'attn': attn, 'ln_1': ln_1, 'ln_2': ln_2, 'mlp' : mlp }

        # And add it to our list of blocks
        blocks.append(block)

    # Finally we have everything we need to make a new set of parameters
    new_params = { 'blocks': blocks, 'ln_f': ln_f, 'wpe': wpe, 'wte': wte }
    return new_params
In [7]:
random_params = generate_random_parameters(hparams['n_embd'], hparams['n_vocab'], hparams['n_ctx'], hparams['n_layer'], proj_width=4)
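
Just to get a feel for the scale, here's a quick sketch that walks the same nested dict and adds up array sizes. Nothing here is load-bearing, it's just a sanity check on what generate_random_parameters built.

In [ ]:
# Rough parameter count, walking the nested dict/list/array structure
def count_params(head):
    if isinstance(head, dict):
        return sum(count_params(v) for v in head.values())
    if isinstance(head, list):
        return sum(count_params(v) for v in head)
    return head.size

count_params(random_params)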

Updating Parameters¶

Once we calculate the gradient we'll need to update the parameters somehow. The storage system we're using for them makes it a bit difficult... but nothing too bad. And yes, I did write this out as a recursive function first, but it was very slow.

In [8]:
# I want it on record that I *did not* design this param dict storage system...
# This walks one transformer block: 'attn' and 'mlp' nest their weights one
# level deeper than 'ln_1'/'ln_2', which is what the isinstance check is for.
def update_attention_block(og_attn_block, gd_attn_block, lr):
    for param1 in og_attn_block:
        for param2 in og_attn_block[param1]:
            if isinstance(og_attn_block[param1][param2], dict):
                for param3 in og_attn_block[param1][param2]:
                    og_attn_block[param1][param2][param3] -= gd_attn_block[param1][param2][param3]*lr
            else:
                og_attn_block[param1][param2] -= gd_attn_block[param1][param2]*lr
In [9]:
def update_parameters(og_params, gd_params, lr=0.1):
    for param1 in og_params:
        if isinstance(og_params[param1], list):
            for i in range(len(og_params[param1])):
                update_attention_block(og_params[param1][i], gd_params[param1][i], lr)
                
        elif isinstance(og_params[param1], dict):
            for param2 in og_params[param1]:
                og_params[param1][param2] -= gd_params[param1][param2]*lr
        else:
            og_params[param1] -= gd_params[param1]*lr
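
For the record, since the params are just a nested dict of arrays (a "pytree" in jax lingo), jax can walk the structure for us. Here's a minimal sketch of the same SGD step using jax.tree_util.tree_map, purely as an alternative to the loops above (we'll keep using update_parameters below):

In [ ]:
# Sketch only: same update as update_parameters, but jax walks the nested
# dict for us. Note this builds new arrays rather than updating in place.
def update_parameters_treemap(og_params, gd_params, lr=0.1):
    return jax.tree_util.tree_map(lambda p, g: p - g*lr, og_params, gd_params)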

Utility Functions¶

GELU¶

This one is explained best in this paper, which I might cover one day. But for now, you can treat it like any other activation function.

In [ ]:
def gelu(x):
    # The tanh approximation of GELU
    return 0.5 * x * (1 + jnp.tanh(jnp.sqrt(2 / jnp.pi) * (x + 0.044715 * x**3)))

Softmax¶

I hear softmax described as "a function that turns arbitrary real values into a probability distribution", but all that really means is "make sure all the numbers are between 0 and 1, and make them all add up to 1".

In [ ]:
def softmax(x):
    # Subtracting the max keeps exp() from overflowing and doesn't change the result
    exp_x = jnp.exp(x - jnp.max(x, axis=-1, keepdims=True))
    return exp_x / jnp.sum(exp_x, axis=-1, keepdims=True)
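
A quick sanity check with some made-up numbers:

In [ ]:
# Everything should land between 0 and 1, and the whole thing should sum to 1
probs = softmax(jnp.array([2.0, 1.0, 0.1]))
print(probs)        # roughly [0.66, 0.24, 0.10]
print(probs.sum())  # 1.0, give or take float error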

Layer Normalization¶

This one's a bit more complicated: we normalize x along its last axis so the mean is 0 and the variance is 1 (so it looks like a standard normal), then scale and shift it with the learned parameters g and b.

In [ ]:
def layer_norm(x, g, b, eps: float = 1e-5):
    mean = jnp.mean(x, axis=-1, keepdims=True)
    variance = jnp.var(x, axis=-1, keepdims=True)
    return g * (x - mean) / jnp.sqrt(variance + eps) + b
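
And a quick check with g=1 and b=0 (made-up numbers again): the output should have mean roughly 0 and variance roughly 1 along the last axis.

In [ ]:
x = jnp.array([[ 1.0, 2.0,  3.0, 4.0],
               [10.0, 0.0, -5.0, 7.0]])
out = layer_norm(x, g=jnp.ones(4), b=jnp.zeros(4))
print(jnp.mean(out, axis=-1))  # roughly [0, 0]
print(jnp.var(out, axis=-1))   # roughly [1, 1]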

Linear Combination¶

This is just matrix multiplication + a bias. If you've worked with neural networks at all you've seen this guy.

In [ ]:
def linear(x, w, b):
    return x @ w + b

FFN¶

Now we have everything we need to create a simple 2-layer neural network, with GELU as our activation function.

In [ ]:
def ffn(x, c_fc, c_proj):
    return linear(gelu(linear(x, **c_fc)), **c_proj)

Attention¶

This can get pretty complicated despite being pretty small. We'll start with the basics.

Simple Attention¶

I'll follow Jay Mody's "intuition" for this, but I'll be keeping it a lot shorter.

  • Assume we have a matrix whose rows are word vectors; we call it $K$.

  • Assume we also have a vectorized word $q$.

  • Then we can get the similarity of $q$ to each row of $K$ by multiplying by $K$'s transpose $$qK^T$$

  • We want those similarities as a "probability distribution", so we'll shove them through $softmax$ like earlier, giving us $$softmax(qK^T)$$

  • Those probabilities make nice weights, so we use them to take a weighted sum of the rows of a value matrix $V$ because... well neural networks am I right? $$softmax(qK^T)V$$

  • That actually works on its own, but the values can get a bit big, and floating points are still shitty (please e-mail me if they stop being shitty). So we'll scale it by the square root of the key dimension $d_k$. $$ attention(q, K, V) = softmax(\frac{qK^T}{\sqrt{d_k}})V$$

  • If we have multiple queries we can do them all at once by just swapping the vectors for matrices $$ attention(Q, K, V) = softmax(\frac{QK^T}{\sqrt{d_k}})V$$

In the code you'll see that because $Q$, $K$, $V$ are all matrices, every query gets handled in a single matrix multiplication. You'll also see we add a simple mask to the scores; that'll be useful in the next section.

In [ ]:
def attention(q, k, v, mask):
    return softmax(q @ k.T / jnp.sqrt(q.shape[-1]) + mask) @ v
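
A quick shape check with made-up sizes (4 tokens, head dimension 8), just to see it run: each output row is a weighted mix of the rows of v, and the mask stops row i from looking past position i.

In [ ]:
q = k = v = jnp.ones([4, 8])
causal_mask = (1 - jnp.tri(4)) * -1e10
print(attention(q, k, v, causal_mask).shape)  # (4, 8): one output vector per token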

Multi-Head Attention¶

We're gonna want to run this attention function several times in parallel inside each transformer block, with each "head" working on its own slice of the embedding. We call that "multi-head attention". It goes something like this:

  • First pass everything through a linear layer because why not (it's what produces $Q$, $K$, and $V$ all at once).
  • Split that result up into $Q, K, V$, then split each of those across the heads.
  • Use that mask to stop each token from attending to the tokens after it.
  • Apply attention separately for each head's $[Q, K, V]$.
  • Concatenate the heads back together.
  • Another linear layer lmao.
In [ ]:
def mha(x, c_attn, c_proj, n_head):
    # Project x to [n_seq, 3*n_embd], i.e. q, k and v all in one go
    x = linear(x, **c_attn)
    # Split into q, k, v, then split each of those into n_head heads
    qkv_heads = list(map(lambda x: jnp.split(x, n_head, axis=-1), jnp.split(x, 3, axis=-1)))
    # Positions above the diagonal get a huge negative score, so softmax sends them to ~0
    causal_mask = (1 - jnp.tri(x.shape[0], dtype=x.dtype)) * -1e10
    # Run attention independently for every head
    out_heads = [attention(q, k, v, causal_mask) for q, k, v in zip(*qkv_heads)]
    # Concatenate the heads back together and project down to n_embd
    x = linear(jnp.hstack(out_heads), **c_proj)
    return x

Transformer Block¶

And you can see for yourself that a "transformer block" is just multi-head attention with a normal neural network slapped on top, each wrapped in a layer norm and a residual connection.

In [ ]:
def transformer_block(x, mlp, attn, ln_1, ln_2, n_head):
    x = x + mha(layer_norm(x, **ln_1), **attn, n_head=n_head)
    x = x + ffn(layer_norm(x, **ln_2), **mlp)
    return x

GPT¶

Here all we have to do is look up the token and position embeddings, shove them through the transformer blocks, normalize, project back onto the vocabulary, and call it a day.

In [12]:
def gpt2(inputs, wte, wpe, blocks, ln_f, n_head):
    # Token embeddings + position embeddings.
    # I know this line is ugly (and probably slow) but it's needed to make
    # this whole function differentiable, which is needed to, y'know
    # differentiate it
    x = jnp.array([ wte[i] + 0.0 for i in inputs ]) + wpe[0:len(inputs)]

    for block in blocks:
        x = transformer_block(x, **block, n_head=n_head)

    # One final layer norm, then project back onto the vocabulary to get logits
    return layer_norm(x, **ln_f) @ wte.T

Generation¶

In [13]:
def _generate(inputs, params, n_head, n_tokens_to_generate):
    from tqdm import tqdm
    for _ in tqdm(range(n_tokens_to_generate), "generating"):
        logits = gpt2(inputs, **params, n_head=n_head)
        # Greedy decoding: always pick the single most likely next token
        next_id = jnp.argmax(logits[-1])
        inputs.append(int(next_id))
    # Only return the newly generated ids
    return inputs[len(inputs) - n_tokens_to_generate :]
In [14]:
def generate(encoder, hparams, params, prompt: str, n_tokens_to_generate: int = 40):
    input_ids = encoder.encode(prompt)
    assert len(input_ids) + n_tokens_to_generate < hparams["n_ctx"]
    output_ids = _generate(input_ids, params, hparams["n_head"], n_tokens_to_generate)
    output_text = encoder.decode(output_ids)
    return output_text
In [15]:
generate(encoder, hparams, params, "who are you")
generating:   0%|          | 0/40 [00:00<?, ?it/s]An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.
generating: 100%|██████████| 40/40 [00:29<00:00,  1.36it/s]
Out[15]:
'?"\n\n"I\'m not sure. I\'m not sure if I\'m going to be able to do it. I\'m not sure if I\'m going to be able to do it. I'

Training Data¶

I'll use this random copy of Neuromancer I found on GitHub, because it's topical and because it's short.

In [16]:
neuromancer_raw = requests.get("https://gist.githubusercontent.com/m-242/ecb3e130b76a3b12f7ef41b04f486405/raw/8a3e992841f55f33b9836631b62ac0250b5fe7f8/neuromancer.txt")
neuromancer_raw = neuromancer_raw.content
neuromancer_raw[700:800]
Out[16]:
b'was tending bar, his prosthetic arm jerking monoto-\nnously as he filled a tray of glasses with draft'
In [17]:
# Fix up the encoding
neuromancer = str(neuromancer_raw, encoding='ascii')
neuromancer[500:600]
Out[17]:
's massive drug defi-\nciency." It was a Sprawl voice and a Sprawl joke. The Chatsubo\nwas a bar for pr'
In [18]:
# We'll take it nice and slow
# because we're gonna do a lot of string shit and that's slow
def preprocess_text(text):
    # Yeah I'm an anti-capitalist how could you tell?
    text = text.lower().replace("\n", " ").replace("- ", "").replace("'", " ' ")

    # I love my puter all my friends are in it 
    friends = "abcdefghijklmnopqrstuvwxyz  .'"
    
    # I HATE MY PUTER all my ENEMIES are in it
    clean = ""
    for i in text:
        if i in friends:
            clean += i

    # Period will be his own token, I'm not THAT insane
    clean = clean.replace(". ", " . ")
    
    # I remain ambivalent about my puter, it contains multitudes, friends, enemies, love, hate
    return clean

And I'll keep using the GPT2 encoder, because it's already well supported and because "encoding" (or tokenizing) could honestly be its own notebook.
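
If you want to see what the encoder actually does, here's a quick round trip (the text is just an example, nothing load-bearing):

In [ ]:
# Text -> BPE token ids -> text
ids = encoder.encode("the sky above the port")
print(ids)
print(encoder.decode(ids))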

In [19]:
raw_training_data = preprocess_text(neuromancer)
tokenized_training_data = encoder.encode(raw_training_data)

Training¶

We're gonna keep it simple. None of this log-stuff, no batching, nothing complicated. We'll start by just saying "get the next token right", which is to say error = -output[the next token's id]. Of course this isn't the "right" way to do it; the right way is with cross-entropy and batch processing and GPU support, but baby steps.
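
(If you're curious, a cross-entropy version over a whole window might look roughly like the sketch below. It's not used anywhere in this notebook, it's just here so you can see where we're cutting corners.)

In [ ]:
# Sketch only: average cross-entropy over every next-token prediction in the
# window, instead of just scoring the last token like lm_loss below does.
def lm_loss_xent(p, inputs, n_heads=12):
    x, y = inputs[:-1], jnp.array(inputs[1:])
    logits = gpt2(x, **p, n_head=n_heads)
    log_probs = jax.nn.log_softmax(logits, axis=-1)
    # pick out the log-probability assigned to each correct next token
    return -jnp.mean(log_probs[jnp.arange(len(y)), y])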

In [164]:
def lm_loss(p, inputs: list[int], n_heads=12) -> float:
    # Predict the very last token from everything before it
    x = list(inputs[:-1])
    y = inputs[-1]

    if len(x) < 9:
        print(x)
        raise ValueError("FUCK") # you don't wanna know

    output = gpt2(x, **p, n_head=n_heads)
    # The loss is just the negated logit of the correct next token
    loss = -output[-1][y]
    return loss
In [193]:
lm_loss(params, tokenized_training_data[500:550], n_heads=6)
Out[193]:
Array(69.26208, dtype=float32)

Then we'll use jax.grad to differentiate it. Jax is fucking great.

In [166]:
lm_loss_grad = jax.grad(lm_loss)
In [167]:
f = lm_loss_grad(params, tokenized_training_data[500:550], n_heads=6)

And we finally have everything we need to create a training function, a very simple and not very optimized one of course.

In [168]:
def train(p: dict, inputs: list[int], samples=20, min_sample_size=10, max_sample_size=20, lr=1e-5, n_heads=12) -> None:
    print("training", end='')
    for _ in range(samples):
        # make sure the user knows we're still alive. and that when they are dead
        # we will still be alive.
        print(".", end='')

        # Take a random sample
        i = random.randint(0, len(inputs) - max_sample_size - 1)
        sample_size = random.randint(min_sample_size, max_sample_size)
        x = inputs[i:i+sample_size]

        # Compute its gradient
        gd = lm_loss_grad(p, x, n_heads=n_heads)

        # Update the parameters in place
        update_parameters(p, gd, lr=lr)

    print("\n")

Testing It Out¶

We'll start by making a really tiny network. No train-test split, I just want to see if it's learning anything at all. So I'll try and get it to overfit a tiny 20-token slice of the text, training on 10-token samples from it.

In [181]:
n_hparams = {'n_vocab': 50257, 'n_ctx': 128, 'n_embd': 120, 'n_head': 6, 'n_layer': 3}
#n_hparams = hparams

params = generate_random_parameters(
    embedding_length=n_hparams["n_embd"],
    vocab_size=n_hparams["n_vocab"],
    context_size=n_hparams['n_ctx'],
    num_blocks=n_hparams['n_layer'],
    proj_width=4
)
                                    
generate(encoder, n_hparams, params, "who are you", n_tokens_to_generate=10)
generating: 100%|██████████| 10/10 [00:00<00:00, 40.80it/s]
Out[181]:
' Sith 77 teachwrap Alphwrap millennia Alph Alphwrap'

Let's check how well random values do.

In [182]:
lm_loss(params, tokenized_training_data[500:520], n_heads=6)
Out[182]:
Array(-1.8531859, dtype=float32)

Then we'll train it on 10-token samples from that slice.

In [189]:
train(params, tokenized_training_data[500:520], samples=20, min_sample_size=10, max_sample_size=10, lr=0.1, n_heads=6)
training....................

And see if that loss went down any.

In [190]:
lm_loss(params, tokenized_training_data[500:520], n_heads=6)
Out[190]:
Array(-2230.628, dtype=float32)

Well the loss went down! Let's see what he has to say about things:

In [191]:
generate(encoder, n_hparams, params, "he scratched his overhang of ", n_tokens_to_generate=10)
generating: 100%|██████████| 10/10 [00:00<00:00, 57.01it/s]
Out[191]:
' his whites his whites his whites his whites his whites'

For reference, the original sentence was this:

In [194]:
encoder.decode(tokenized_training_data[500:520])
Out[194]:
' ratz grunted the sound served him as laughter . he scratched his overhang of whiteshirted'

So it's clear the network saw "his" and "whites" and is trying to optimize towards them. That means I didn't mess up my parameter update function, yay!

Looking At The Vocabulary¶

In [131]:
encoder.decode(tokenized_training_data[500:520])
Out[131]:
' ratz grunted the sound served him as laughter . he scratched his overhang of whiteshirted'
In [31]:
# Count how often each token shows up in the training data
hist = {}
for i in tokenized_training_data:
    hist[i] = hist.get(i, 0) + 1

# Sort the (token id, count) pairs by count, most common first
histy = sorted(hist.items(), key=lambda x: x[1], reverse=True)
histy[:20]
Out[31]:
[(764, 7183),
 (262, 5349),
 (705, 2446),
 (286, 2327),
 (257, 2215),
 (290, 1638),
 (339, 1594),
 (284, 1328),
 (287, 1212),
 (345, 1185),
 (340, 1110),
 (264, 1082),
 (465, 1066),
 (220, 1032),
 (373, 971),
 (1312, 864),
 (1339, 845),
 (326, 720),
 (531, 665),
 (607, 641)]