Word Embeddings¶
I've recently taken on a project that could benefit from word similarity checking, so I'd like to build some word-vectorization system. That's been done plenty of times before, so I'm essentially just going to steal the old skip-gram and implement it from "scratch" (I use that term lightly). There are really only two constraints I have here:
- The model has to be able to do a forwards and backwards pass on my laptop.
- The embedding cannot be too large, because I would like to use it on my website.
The only thing I need this to actually do is check the similarity of words, but it would be fun if we got a full vector space out of it.
from functools import cache
from random import randint
import pickle
import jax
import jax.numpy as jnp
from jax import jit
import numpy as np
from matplotlib import pyplot as plt
Loading Training Data¶
Like a lot of word-vectorization projects, we're going to be making use of text8, because the pre-processing is already done and it's more than big enough for this. The data comes all-lower-case with no punctuation, so the only thing we have to do is build a vocabulary out of it. I'll do that the slow way with Python's built-in set class.
def make_vocab(text):
vocab = list(set(text))
return vocab
We'll be using matrix multiplication later to select the word embeddings. It's often much faster just to use the index of the token, i.e. word_embeddings[token_id], but that seems to mess with the JAX autograd implementation and will throw off our gradient. So we'll need this:
def token_to_one_hot(token: int, vocab_size: int):
z = np.zeros([vocab_size])
z[token] = 1.0
return jnp.array(z)
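As a quick aside (a throwaway check of my own, not part of the original pipeline), the one-hot trick really is just a row lookup, and jax.grad happily differentiates through the matmul:
# Multiplying a one-hot vector by an embedding matrix selects the matching row.
demo_embeddings = jnp.arange(12.0).reshape(4, 3)   # 4 "words", 3-dimensional embeddings
one_hot = token_to_one_hot(2, 4)

selected = jnp.dot(one_hot, demo_embeddings)
print(jnp.allclose(selected, demo_embeddings[2]))  # True: same as indexing row 2

# The gradient of anything built on the selected row flows back into row 2 only.
grad_fn = jax.grad(lambda e: jnp.sum(jnp.dot(one_hot, e)))
print(grad_fn(demo_embeddings))                    # ones in row 2, zeros elsewhere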
def load_text(path: str = "./text8"):
with open(path, 'r') as opn:
l = opn.read()
return l.split()
sample_data = load_text()
sample_data = sample_data[:int(0.1*len(sample_data))]
sample_vocab = make_vocab(sample_data)
I was trying not to use too many globals, so that you could just copy-paste the functions out of this notebook and end up with a decent library. But @cache only works when a function's arguments are hashable, so I can't pass the vocab list in as a parameter, and without the cache this lookup would be worst-case $O(n)$, where $n$ is the size of our vocab.
@cache
def tokenize_word(word):
return sample_vocab.index(word)
def tokenize_data(input_data):
    total = len(input_data)
    for i in range(total):
        # Print the occasional progress update; this loop takes a while.
        if i % 100000 == 0:
            print(f"{i}/{total}")
        input_data[i] = tokenize_word(input_data[i])
    return input_data
tokenized_text = tokenize_data(sample_data)
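If you'd rather avoid the global-plus-@cache combination entirely, a plain dictionary from word to index does the same job in $O(1)$ per lookup and can be passed around explicitly. A minimal sketch (these function names are my own and aren't used elsewhere in the notebook):
# Hypothetical alternative to @cache + a global vocab list: an explicit
# word -> index dictionary, built once, gives O(1) lookups with no globals.
def make_token_map(vocab):
    return {word: i for i, word in enumerate(vocab)}

def tokenize_with_map(words, token_map):
    return [token_map[word] for word in words]

# This would stand in for tokenize_word/tokenize_data above, e.g.:
# token_map = make_token_map(sample_vocab)
# tokens = tokenize_with_map(list_of_words, token_map)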
Let's just make sure our tokenization/detokenization is working properly...
for token in tokenized_text[0:20]:
    print(sample_vocab[token], end=' ')
anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english
Pre-Processing¶
This is actually where most of the work is. We're going to train the network to take in some word and predict the probabilities of all the other words appearing next to it. That means for each word we need to go through the dataset and count the occurrence of other words near it. Because we'll be passing this through the network with batch training, we'll want it in the format of an n_vocab by n_vocab matrix. Unfortunately n_vocab is very big, and if we made one of those my Fujitsu laptop would run out of RAM (ask me how I know). Instead we'll store the counts in a hashmap of the form hashmap[token] = # of times that token was next to the word (and make a list of those, one per vocab word). Then we'll pass it through a different function to convert it back into a vector, right before we use it.
This hashmap is actually very useful in its own right, and I could use it to check the similarity of words for my project. Unfortunately it's also very big, and so doesn't adhere to the second requirement.
def create_word_frequency_count(input_data, vocab):
# How far we'll look to the left/right of the word for other words
window_size = 5
    # Let's start by making an empty list of dictionaries. If we used an n_vocab by n_vocab matrix
# we'd run out of memory instantly.
word_frequencies = []
for _ in range(len(vocab)):
word_frequencies.append({})
for i in range(window_size+1, len(input_data) - window_size-1):
token = input_data[i]
for j in range(-window_size, window_size+1):
if j == 0: continue
# Get the token of the words in our window
neighboring_word = input_data[i + j]
# In Python, it's faster to try and crash-out than to check first
# isn't that funny?
try:
word_frequencies[token][neighboring_word] += 1.0
except KeyError as e:
word_frequencies[token][neighboring_word] = 1.0
return word_frequencies
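To make the structure concrete, here's a toy run of mine on a hand-tokenized fifteen-token "corpus" over a made-up three-word vocab (with window_size hardcoded to 5 above, only the middle few positions actually get counted, but the shape of the result is the point):
# Toy demonstration: the result is a list indexed by token id, where each entry
# is a dict mapping neighboring token ids to how often they appeared in the window.
toy_vocab = ["cats", "chase", "dogs"]
toy_corpus = [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]  # token ids, not words

toy_frequencies = create_word_frequency_count(toy_corpus, toy_vocab)
for token_id, neighbors in enumerate(toy_frequencies):
    print(toy_vocab[token_id], neighbors)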
def softmax(x):
    # Subtract the max for numerical stability, and normalize along the last axis
    # so this works both for single vectors and for (batch, vocab) matrices.
    e_x = jnp.exp(x - jnp.max(x, axis=-1, keepdims=True))
    return e_x / e_x.sum(axis=-1, keepdims=True)
And this is that function that turns the hashmap back into a vector (representing a probability distribution) so we can put it through our network.
def dict_to_arr(d: dict, vocab_size: int):
z = np.zeros([vocab_size])
for i in d:
z[i] = d[i]
z = softmax(z)
return z
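Quick sanity check (a throwaway example of my own): a counts dictionary over a pretend five-word vocab comes back as a dense distribution that sums to one.
# A sparse counts dict becomes a dense vector, and softmax turns it into probabilities.
toy_counts = {0: 3.0, 2: 1.0}
toy_dist = dict_to_arr(toy_counts, 5)
print(toy_dist)
print(toy_dist.sum())  # ~1.0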
# Note: token id 0 gets skipped and any final partial batch is dropped; good enough here.
def batch_data(word_frequencies: list, vocab_size: int, batch_size=10):
batches = []
batch = []
batch_tokens = []
for i in range(1, len(word_frequencies)):
batch.append(dict_to_arr(word_frequencies[i], vocab_size))
batch_tokens.append(token_to_one_hot(i, vocab_size))
if i % batch_size == 0:
batches.append([
jnp.vstack(batch_tokens),
jnp.vstack(batch)
])
batch = []
batch_tokens = []
return batches
word_frequency_count = create_word_frequency_count(tokenized_text, sample_vocab)
Now that we've made the frequency count dictionary, we can deallocate the original sample data (which is good, because it's pretty big).
# RECLAIM WHAT WAS ONCE OURS (RAM mostly). Both names point at the same list,
# so both have to go before the memory can actually be freed.
del sample_data, tokenized_text
batched_sample_data = batch_data(word_frequency_count, len(sample_vocab))
An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.
batched_sample_data[0][1].shape
(10, 70889)
Forwards Pass¶
The forwards pass of this network is also pretty simple. All we have to do is:
- Take some token.
- One-hot encode it (done outside the function to make differentiation easier).
- Use the one-hot encoding to look up the embedding.
- Project the embeddings up to the size of the original vocabulary.
- Use softmax to turn that projection into a probability distribution.
def forward(one_hot_encoded_inputs: jnp.array, embeddings: jnp.array, weights: jnp.array):
x = jnp.dot(one_hot_encoded_inputs, embeddings)
x = jnp.dot(x, weights)
return softmax(x)
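Before wiring this up to the real vocab, here's a tiny check of my own with throwaway random parameters: feeding in one one-hot token gives back a probability distribution over the pretend vocab.
# Throwaway forward-pass check: pretend 6-word vocab, 3-dimensional embeddings.
rng = np.random.default_rng(0)
demo_embed = jnp.array(rng.normal(size=(6, 3)))
demo_weights = jnp.array(rng.normal(size=(3, 6)))

demo_out = forward(token_to_one_hot(0, 6), demo_embed, demo_weights)
print(demo_out.shape)  # (6,): one score per vocab word
print(demo_out.sum())  # ~1.0, so it really is a probability distribution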
Parameters¶
All we'll need is two matrices, one for holding the embeddings of each token (think of it less like a matrix, and more like a list of vectors), and one for projecting those embeddings up to the size of vocabulary. We won't add a separate matrix for biasing, because we want any biasing to be done in our embeddings (since thats the only part of the network that will make it out of this notebook).
def gen_params(vocab_size: int, embedding_size: int):
embeddings = np.random.uniform(-1.0, 1.0, size=[vocab_size, embedding_size])
weights = np.random.uniform(-1.0, 1.0, size=[embedding_size, vocab_size])
return {"weights": weights, "embeddings": embeddings}
def update_params(p1, grad_p1, lr=0.01):
for param in p1:
p1[param] -= lr * grad_p1[param]
return p1
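update_params mutates the dict in place, which is fine for this notebook. Just as an aside, the same SGD step can be written without mutation using jax.tree_util.tree_map (a sketch, not what the training loop below uses):
# Functional version of the same update: apply p - lr * g to every leaf of the
# params pytree and return a fresh dict instead of mutating the old one.
def update_params_functional(params, grads, lr=0.01):
    return jax.tree_util.tree_map(lambda p, g: p - lr * g, params, grads)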
Let's generate some parameters to test with. The embedding size here is 100, which is what the 10-by-10 gradient reshape further down assumes.
test_params = gen_params(len(sample_vocab), 100)
forward(batched_sample_data[0][0], test_params["embeddings"], test_params["weights"]).shape
(10, 70889)
Loss¶
Technically the target here is a probability distribution over discrete classes, so cross-entropy would be a better loss function. Unfortunately, JAX handles it somewhat poorly (or I've been using it wrong), and the jnp.log2 function shows zero regard for floating-point numbers. Because of that we'll use MSE, which frankly works just fine on a network of this size and simplicity. Honestly, I get the feeling I could use almost any loss function, as long as it expresses some kind of distance between the predicted distribution and the actual distribution.
def batch_loss(params: dict, batch_x: jnp.array, batch_y: jnp.array):
prediction = forward(batch_x, params["embeddings"], params["weights"])
se = jnp.sum((batch_y - prediction)**2, axis=1)
mse = jnp.mean(se)
return mse
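For what it's worth, if you do want cross-entropy, the usual trick is to stay in log space instead of taking the log of a softmax output; jax.nn.log_softmax sidesteps the floating-point trouble mentioned above. A hedged sketch, not the loss actually used in this notebook:
# Hypothetical cross-entropy alternative: compute log-probabilities directly
# from the logits so we never take the log of a value that has underflowed to zero.
def batch_cross_entropy(params: dict, batch_x: jnp.array, batch_y: jnp.array):
    logits = jnp.dot(jnp.dot(batch_x, params["embeddings"]), params["weights"])
    log_probs = jax.nn.log_softmax(logits, axis=-1)
    return -jnp.mean(jnp.sum(batch_y * log_probs, axis=1))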
batch_loss(test_params, batched_sample_data[0][0], batched_sample_data[0][1])
Array(3982.1328, dtype=float32)
batch_loss_grad = jax.grad(batch_loss)
test_g = batch_loss_grad(test_params, batched_sample_data[0][0], batched_sample_data[0][1])
This plot of the gradient for a single token's embedding (reshaped to 10 by 10) is pretty much useless, but it looks cool, and it's always fun to see what's going on inside the ol' blackbox.
plt.imshow(np.array(test_g['embeddings'][1].reshape([10, 10])))
plt.show()
Train¶
See, this is why we jumped through a couple of hoops earlier. Now we can just use jax.grad to differentiate the loss function, meaning all we have to worry about is calling it and applying the gradient. Handy, right?
def batch_train(params: dict, batches: list, epochs=10):
for _ in range(epochs):
for i, batch in enumerate(batches):
batch_tokens = batch[0]
batch_prob_dist = batch[1]
g = batch_loss_grad(params, batch_tokens, batch_prob_dist)
update_params(params, g, lr=0.01)
batch_train(test_params, batched_sample_data, epochs=20)
batch_loss(test_params, batched_sample_data[0][0], batched_sample_data[0][1])
Array(722.1161, dtype=float32)
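Training like this on the CPU takes a while, and the jit imported at the top never actually gets used. One likely speed-up (a sketch under that assumption, not what produced the numbers above) is to fuse the gradient computation and the update into a single compiled step:
# Optional: a jit-compiled training step. jax.value_and_grad returns the loss
# alongside the gradients, so progress can be monitored for free.
@jit
def train_step(params, batch_tokens, batch_prob_dist, lr=0.01):
    loss, grads = jax.value_and_grad(batch_loss)(params, batch_tokens, batch_prob_dist)
    new_params = {name: params[name] - lr * grads[name] for name in params}
    return new_params, loss
The inner loop of batch_train would then just be params, loss = train_step(params, batch[0], batch[1]).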
Playing Around¶
What use is making word vectors if we don't use them for anything? Let's start by writing some quick utilities to vectorize words more easily; I'm starting to get carpal tunnel from all these notes, you know?
def embed_word(word):
return test_params["embeddings"][sample_vocab.index(word)]
def unembed_word(vector):
return sample_vocab[np.argmax(np.dot(vector, test_params["embeddings"].T))]
man = embed_word("man")
apple = embed_word("apple")
woman = embed_word("woman")
dog = embed_word("dog")
cat = embed_word("cat")
computer = embed_word("computer")
science = embed_word("science")
chemistry = embed_word("chemistry")
physics = embed_word("physics")
Then let's define a simple test to see how similar two words are. The classic choice is of course cosine similarity, so I'll use that, then scale it up by a factor of 100 to make it easier for me to read.
def cosine_similarity(A, B):
return np.dot(A,B)/(np.linalg.norm(A)*np.linalg.norm(B))
def test(a, b):
return cosine_similarity(a, b) * 100
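One more utility of my own (brute force over the whole vocab, so it's not fast): the nearest neighbours of a word by cosine similarity.
# Hypothetical helper: the k vocab words whose embeddings are most similar
# (by cosine similarity) to the query word's embedding.
def nearest_words(word, k=5):
    query = embed_word(word)
    embeddings = np.asarray(test_params["embeddings"])
    sims = embeddings @ query / (np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query))
    best = np.argsort(sims)[::-1][:k + 1]  # k+1 so we can drop the word itself
    return [(sample_vocab[i], float(sims[i])) for i in best if sample_vocab[i] != word][:k]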
Okay, let's see how related certain words are:
test(chemistry, physics)
21.926777064800262
test(apple, computer)
-1.7913002520799637
The dataset is pretty old, so that correlation might not come across just yet. Let's try something more universal: it was always my impression that women were more likely to have cats, and men were more likely to have dogs. Let's see if the network agrees:
test(man, cat)
-0.5196941550821066
test(woman, cat)
-0.5642959382385015
test(man, dog)
-11.501768976449966
test(dog, woman)
-4.738159850239754
test(woman, man)
-2.9907846823334694
Of course, the network looks for word associations, so really what this means is that people talk about cats and dogs in the same context as "woman" more often than they talk about them in the context of "man", and across the board they talk about cats in relation to the words "woman" or "man" more than they talk about dogs.
test(cat, dog)
3.7614338099956512
test(apple, man)
-0.04189200117252767
These correlations aren't entirely perfect of course, but you can see that they do have some genuine sense to them! Considering the size and simplicity of the network, I'm actually surprised. I figured that with so little data and such a small embedding size we wouldn't get much of an end-user experience, but this can actually be applied to something. Which is good, because I cooked it up with a certain something in mind.
Let's test it for some mathematical purity. If the embeddings really form a tidy vector space, two opposing words should point in opposite directions, so the difference of their normalized vectors should have a length of about two (and two near-synonyms should give something close to zero).
w1, w2 = "big", "small"
v1 = embed_word(w1)/np.linalg.norm(embed_word(w1))
v2 = embed_word(w2)/np.linalg.norm(embed_word(w2))
np.linalg.norm(v1 - v2)
1.427628
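Since both vectors have unit length, $\lVert v_1 - v_2 \rVert = \sqrt{2 - 2\cos\theta}$, so we can read the cosine similarity straight back off that number:
# Recover the cosine similarity from the distance between the two unit vectors:
# ||v1 - v2||^2 = 2 - 2*cos(theta) when both vectors are normalized.
d = np.linalg.norm(v1 - v2)
print(1 - d**2 / 2)  # should match cosine_similarity(v1, v2)
A distance of about 1.43 therefore means "big" and "small" come out roughly orthogonal rather than pointing in opposite directions.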
Well, that might have been a long shot; not that I had any practical use for it.