← volver a la publicación›Editar / bifurcar

MicroGPT in Hew v0.3

#3ZBHcfa32ypublic / publicnueva versión

por @slepp creado 2 months ago sin caducidad 19.5 KB sintaxis: hew

Tus cambios crean una nueva publicación enlazada a esta — la original no se toca.

nueva versión Tus cambios crean una nueva publicación enlazada a esta — la original no se toca.

Visibilidad Feed público Acceso public

Nota de cambio (opcional)

Esta publicación aparecerá en el feed público. Cambia Visibilidad si solo quieres compartirla por enlace.

Cancelar

// microgpt.hew — MicroGPT in pure Hew (v0.3 dialect)
// Port of Karpathy's microgpt.py — a complete GPT language model.
// Trains on a list of names, then generates new ones.
//
// Architecture:
// - Tape-based autograd with reverse-mode differentiation
// - GPT-2 style: token/position embeddings, multi-head attention, RMSNorm, MLP
// - Adam optimizer with bias correction and linear LR decay
// - Uses math.* (LLVM intrinsics) and random.* (MT19937) from Hew stdlib

// ============================================================
// Hyperparameters
// ============================================================

const N_EMBD: Int = 16;
const BLOCK_SIZE: Int = 16;
const N_HEAD: Int = 4;
const HEAD_DIM: Int = 4;    // N_EMBD / N_HEAD
const NUM_STEPS: Int = 200;

// ============================================================
// SECTION 1: Tape-based Autograd
// ============================================================
// Every "Value" is an Int index into parallel arrays.
// Operations append to the tape and record the DAG for backward().

// A Val is a tape index — an Int that refers to a node in the computation graph.
type Val = Int;

// Tape storage — passed as a struct to avoid globals
type Tape {
    data: Vec<f64>;      // forward values
    grad: Vec<f64>;      // gradient accumulators
    left: Vec<Int>;      // left child index (-1 = none)
    right: Vec<Int>;     // right child index (-1 = none)
    lgrad: Vec<f64>;     // local gradient for left child
    rgrad: Vec<f64>;     // local gradient for right child
}

fn tape_new() -> Tape {
    let d: Vec<f64> = Vec::new();
    let g: Vec<f64> = Vec::new();
    let l: Vec<Int> = Vec::new();
    let r: Vec<Int> = Vec::new();
    let lg: Vec<f64> = Vec::new();
    let rg: Vec<f64> = Vec::new();
    Tape {
        data: d,
        grad: g,
        left: l,
        right: r,
        lgrad: lg,
        rgrad: rg
    }
}

impl Tape {

// Create a new leaf value on the tape. Returns its index.
fn val(t: Tape, d: f64) -> Val {
    let idx = t.data.len();
    t.data.push(d);
    t.grad.push(0.0);
    t.left.push(-1);
    t.right.push(-1);
    t.lgrad.push(0.0);
    t.rgrad.push(0.0);
    idx
}

// Get data of a tape value
fn vd(t: Tape, i: Val) -> f64 { t.data[i] }

// Get grad of a tape value
fn vg(t: Tape, i: Val) -> f64 { t.grad[i] }

// a + b
fn vadd(t: Tape, a: Val, b: Val) -> Val {
    let idx = t.data.len();
    t.data.push(t.vd(a) + t.vd(b));
    t.grad.push(0.0);
    t.left.push(a);
    t.right.push(b);
    t.lgrad.push(1.0);
    t.rgrad.push(1.0);
    idx
}

// a * b
fn vmul(t: Tape, a: Val, b: Val) -> Val {
    let idx = t.data.len();
    t.data.push(t.vd(a) * t.vd(b));
    t.grad.push(0.0);
    t.left.push(a);
    t.right.push(b);
    t.lgrad.push(t.vd(b));  // d/da(a*b) = b
    t.rgrad.push(t.vd(a));  // d/db(a*b) = a
    idx
}

// a ** c (c is a float constant, not a tape value)
fn vpow(t: Tape, a: Val, c: f64) -> Val {
    let ad = t.vd(a);
    let idx = t.data.len();
    t.data.push(math.pow(ad, c));
    t.grad.push(0.0);
    t.left.push(a);
    t.right.push(-1);
    t.lgrad.push(c * math.pow(ad, c - 1.0));  // d/da(a^c) = c*a^(c-1)
    t.rgrad.push(0.0);
    idx
}

// log(a)
fn vlog(t: Tape, a: Val) -> Val {
    let ad = t.vd(a);
    let idx = t.data.len();
    t.data.push(math.log(ad));
    t.grad.push(0.0);
    t.left.push(a);
    t.right.push(-1);
    t.lgrad.push(1.0 / ad);  // d/da(ln(a)) = 1/a
    t.rgrad.push(0.0);
    idx
}

// exp(a)
fn vexp(t: Tape, a: Val) -> Val {
    let ad = t.vd(a);
    let ev = math.exp(ad);
    let idx = t.data.len();
    t.data.push(ev);
    t.grad.push(0.0);
    t.left.push(a);
    t.right.push(-1);
    t.lgrad.push(ev);  // d/da(e^a) = e^a
    t.rgrad.push(0.0);
    idx
}

// relu(a)
fn vrelu(t: Tape, a: Val) -> Val {
    let ad = t.vd(a);
    let idx = t.data.len();
    t.data.push(math.max(0.0, ad));
    t.grad.push(0.0);
    t.left.push(a);
    t.right.push(-1);
    if ad > 0.0 { t.lgrad.push(1.0); } else { t.lgrad.push(0.0); }
    t.rgrad.push(0.0);
    idx
}

// -a (negate)
fn vneg(t: Tape, a: Val) -> Val {
    let neg1 = t.val(-1.0);
    t.vmul(a, neg1)
}

// a - b
fn vsub(t: Tape, a: Val, b: Val) -> Val {
    t.vadd(a, t.vneg(b))
}

// a / b = a * b^(-1)
fn vdiv(t: Tape, a: Val, b: Val) -> Val {
    t.vmul(a, t.vpow(b, -1.0))
}

// Multiply tape value by a float constant: a * c
fn vmul_const(t: Tape, a: Val, c: f64) -> Val {
    let cv = t.val(c);
    t.vmul(a, cv)
}

// Add float constant: a + c
fn vadd_const(t: Tape, a: Val, c: f64) -> Val {
    let cv = t.val(c);
    t.vadd(a, cv)
}

// Sum a list of tape values
fn vsum(t: Tape, indices: Vec<Val>) -> Val {
    let n = indices.len();
    if n == 0 { return t.val(0.0); }
    var acc = indices[0];
    var i = 1;
    while i < n { acc = t.vadd(acc, indices[i]); i += 1; }
    acc
}

} // impl Tape

// Set data of a tape value (standalone: impl method receivers cannot be var)
fn vsd(var t: Tape, i: Val, d: f64) { t.data[i] = d; }

// Backward pass: compute gradients for all tape values
fn backward(var t: Tape, loss_idx: Val) {
    // Topological sort is implicit: tape indices are already in topo order
    // (every value depends only on values with smaller indices)
    t.grad[loss_idx] = 1.0;

// Walk backward through tape
    var i = loss_idx;
    while i >= 0 {
        let g = t.grad[i];
        let li = t.left[i];
        let ri = t.right[i];
        if li >= 0 {
            t.grad[li] += t.lgrad[i] * g;
        }
        if ri >= 0 {
            t.grad[ri] += t.rgrad[i] * g;
        }
        i -= 1;
    }
}

// ============================================================
// SECTION 2: Matrix operations on tape values
// ============================================================
// A "matrix" is a flat Vec<Val> of tape indices, with rows*cols layout.
// We store dimensions separately.

// Create a matrix of random values on the tape
fn mat_rand(t: Tape, rows: Int, cols: Int, std: f64) -> Vec<Val> {
    let m: Vec<Val> = Vec::new();
    for r in 0..rows {
        for c in 0..cols {
            let v = random.gauss(0.0, std);
            m.push(t.val(v));
        }
    }
    m
}

// Linear transform: y = W @ x, where W is [nout x nin], x is Vec<Val> of length nin
// Returns Vec<Val> of length nout
fn linear(t: Tape, w: Vec<Val>, nout: Int, x: Vec<Val>) -> Vec<Val> {
    let nin = x.len();
    let result: Vec<Val> = Vec::new();
    for r in 0..nout {
        // Dot product of row r of W with x
        let temps: Vec<Val> = Vec::new();
        for c in 0..nin {
            let wi = w[r * nin + c];
            let xi = x[c];
            temps.push(t.vmul(wi, xi));
        }
        result.push(t.vsum(temps));
    }
    result
}

// Softmax on Vec<Val> of tape values. Returns Vec<Val> (new tape values = probs).
fn softmax(t: Tape, logits: Vec<Val>) -> Vec<Val> {
    let n = logits.len();
    // Find max for numerical stability
    var max_val = t.vd(logits[0]);
    var i = 1;
    while i < n {
        let v = t.vd(logits[i]);
        if v > max_val { max_val = v; }
        i += 1;
    }

// exp(logit - max)
    let exps: Vec<Val> = Vec::new();
    for i in 0..n {
        let shifted = t.vsub(logits[i], t.val(max_val));
        exps.push(t.vexp(shifted));
    }

// sum of exps
    let total = t.vsum(exps);

// divide each by total
    let probs: Vec<Val> = Vec::new();
    for i in 0..n { probs.push(t.vdiv(exps[i], total)); }
    probs
}

// RMSNorm on Vec<Val>. Returns Vec<Val> of same length.
fn rmsnorm(t: Tape, x: Vec<Val>) -> Vec<Val> {
    let n = x.len();
    // ms = sum(xi^2) / n
    let sq_terms: Vec<Val> = Vec::new();
    for i in 0..n { sq_terms.push(t.vmul(x[i], x[i])); }
    let ms_sum = t.vsum(sq_terms);
    let fn_val = n.to_f64();
    let ms = t.vmul_const(ms_sum, 1.0 / fn_val);

// scale = (ms + 1e-5)^(-0.5)
    let scale = t.vpow(t.vadd_const(ms, 1.0e-5), -0.5);

// xi * scale
    let result: Vec<Val> = Vec::new();
    for i in 0..n { result.push(t.vmul(x[i], scale)); }
    result
}

// ============================================================
// SECTION 3: Helpers, Model struct, GPT forward
// ============================================================

// Copy a Vec<Val> of tape indices (shallow copy of index values)
fn vec_copy(src: Vec<Val>) -> Vec<Val> {
    let dst: Vec<Val> = Vec::new();
    for i in 0..src.len() { dst.push(src[i]); }
    dst
}

// Initialize a KV cache of given size filled with zeros
fn init_kv_cache(size: Int) -> Vec<Val> {
    let cache: Vec<Val> = Vec::new();
    for i in 0..size { cache.push(0); }
    cache
}

// Tokenize a document: [BOS] + char tokens + [BOS]
fn tokenize_doc(raw: String, start: Int, len: Int, char_to_tok: HashMap<String, Int>, bos: Int) -> Vec<Int> {
    let tokens: Vec<Int> = Vec::new();
    tokens.push(bos);
    for i in 0..len {
        let ch = raw.slice(start + i, start + i + 1);
        match char_to_tok.get(ch) {
            Some(tok) => tokens.push(tok),
            None => {},
        }
    }
    tokens.push(bos);
    tokens
}

// All weight matrices for the 1-layer GPT
type Model {
    wte: Vec<Val>;
    wpe: Vec<Val>;
    lm_head: Vec<Val>;
    attn_wq: Vec<Val>;
    attn_wk: Vec<Val>;
    attn_wv: Vec<Val>;
    attn_wo: Vec<Val>;
    mlp_fc1: Vec<Val>;
    mlp_fc2: Vec<Val>;
}

// Run one GPT forward step. Returns logits as Vec<Val>.
fn gpt_forward(
    tape: Tape, model: Model,
    token_id: Int, pos_id: Int,
    var keys: Vec<Val>, var vals: Vec<Val>,
    vocab_size: Int
) -> Vec<Val> {
    // Token + position embedding
    let x: Vec<Val> = Vec::new();
    for d in 0..N_EMBD {
        let tok_emb = model.wte[token_id * N_EMBD + d];
        let pos_emb = model.wpe[pos_id * N_EMBD + d];
        x.push(tape.vadd(tok_emb, pos_emb));
    }

// RMSNorm
    let x_normed = rmsnorm(tape, x);

// --- Attention block ---
    let x_residual = vec_copy(x_normed);
    let x_pre = rmsnorm(tape, x_normed);

let q = linear(tape, model.attn_wq, N_EMBD, x_pre);
    let k = linear(tape, model.attn_wk, N_EMBD, x_pre);
    let v = linear(tape, model.attn_wv, N_EMBD, x_pre);

// Store K, V in cache
    for d in 0..N_EMBD {
        keys[pos_id * N_EMBD + d] = k[d];
        vals[pos_id * N_EMBD + d] = v[d];
    }
    let nc = pos_id + 1;

// Multi-head attention
    let x_attn: Vec<Val> = Vec::new();
    for h in 0..N_HEAD {
        let hs = h * HEAD_DIM;
        let attn_logits: Vec<Val> = Vec::new();
        for tt in 0..nc {
            let dot_terms: Vec<Val> = Vec::new();
            for jj in 0..HEAD_DIM {
                dot_terms.push(tape.vmul(q[hs + jj], keys[tt * N_EMBD + hs + jj]));
            }
            let dot = tape.vsum(dot_terms);
            attn_logits.push(tape.vmul_const(dot, 1.0 / math.sqrt(HEAD_DIM.to_f64())));
        }
        let attn_weights = softmax(tape, attn_logits);
        for jj in 0..HEAD_DIM {
            let weighted_terms: Vec<Val> = Vec::new();
            for tt in 0..nc {
                weighted_terms.push(tape.vmul(attn_weights[tt], vals[tt * N_EMBD + hs + jj]));
            }
            x_attn.push(tape.vsum(weighted_terms));
        }
    }

// Output projection + residual
    let x_proj = linear(tape, model.attn_wo, N_EMBD, x_attn);
    let x_after_attn: Vec<Val> = Vec::new();
    for d in 0..N_EMBD {
        x_after_attn.push(tape.vadd(x_proj[d], x_residual[d]));
    }

// --- MLP block ---
    let x_res2 = vec_copy(x_after_attn);
    let x_mlp_in = rmsnorm(tape, x_after_attn);
    let x_fc1 = linear(tape, model.mlp_fc1, 4 * N_EMBD, x_mlp_in);
    let x_relu: Vec<Val> = Vec::new();
    for d in 0..(4 * N_EMBD) { x_relu.push(tape.vrelu(x_fc1[d])); }
    let x_fc2 = linear(tape, model.mlp_fc2, N_EMBD, x_relu);
    let x_final: Vec<Val> = Vec::new();
    for d in 0..N_EMBD {
        x_final.push(tape.vadd(x_fc2[d], x_res2[d]));
    }

// Logits
    linear(tape, model.lm_head, vocab_size, x_final)
}

// ============================================================
// SECTION 4: Main — Dataset, Training, Inference
// ============================================================

fn main() -> Int {
    // --- Load dataset ---
    println("Loading dataset...");
    let raw = read_file("input.txt");
    let raw_len = raw.len();
    println(f"File length: {raw_len}");

// Parse raw text into lines (one document per line)
    var doc_starts: Vec<Int> = Vec::new();
    let doc_lens: Vec<Int> = Vec::new();
    doc_starts.push(0);
    var pos = 0;
    while pos < raw_len {
        if string_char_at(raw, pos) == '\n' {
            let start = doc_starts[doc_starts.len() - 1];
            let doc_len = pos - start;
            if doc_len > 0 {
                doc_lens.push(doc_len);
                doc_starts.push(pos + 1);
            } else {
                doc_starts[doc_starts.len() - 1] = pos + 1;
            }
        }
        pos += 1;
    }
    let last_start = doc_starts[doc_starts.len() - 1];
    if last_start < raw_len {
        doc_lens.push(raw_len - last_start);
    }
    let num_docs = doc_lens.len();
    println(f"num docs: {num_docs}");

// Build sorted vocabulary of unique characters
    var uchars: Vec<String> = Vec::new();   // unique chars as 1-char strings

for di in 0..num_docs {
        let ds = doc_starts[di];
        let dl = doc_lens[di];
        for cj in 0..dl {
            let ch = raw.slice(ds + cj, ds + cj + 1);
            // Check if ch is already in uchars
            var found = false;
            var uk = 0;
            while uk < uchars.len() {
                if uchars[uk] == ch {
                    found = true;
                    break;
                }
                uk += 1;
            }
            if !found {
                uchars.push(ch);
            }
        }
    }

// Sort uchars by char code (insertion sort)
    var si2 = 1;
    while si2 < uchars.len() {
        let key = uchars[si2].clone();
        let key_code = string_char_at(key, 0);
        var j2 = si2 - 1;
        while j2 >= 0 {
            let a_code = string_char_at(uchars[j2], 0);
            if a_code > key_code {
                uchars[j2 + 1] = uchars[j2].clone();
                j2 -= 1;
            } else {
                break;
            }
        }
        uchars[j2 + 1] = key;
        si2 += 1;
    }

// Build char-to-token lookup
    let char_to_tok: HashMap<String, Int> = HashMap::new();
    for i in 0..uchars.len() {
        char_to_tok.insert(uchars[i], i);
    }

let bos = uchars.len();  // BOS token id
    let vocab_size = uchars.len() + 1;
    println(f"vocab size: {vocab_size}");

// --- Shuffle document indices ---
    let doc_order: Vec<Int> = Vec::new();
    for di in 0..num_docs { doc_order.push(di); }

random.seed(42);
    random.shuffle(doc_order);

// --- Initialize model parameters ---
    println("Initializing parameters...");
    let t = tape_new();

let model = Model {
        wte: mat_rand(t, vocab_size, N_EMBD, 0.08),
        wpe: mat_rand(t, BLOCK_SIZE, N_EMBD, 0.08),
        lm_head: mat_rand(t, vocab_size, N_EMBD, 0.08),
        attn_wq: mat_rand(t, N_EMBD, N_EMBD, 0.08),
        attn_wk: mat_rand(t, N_EMBD, N_EMBD, 0.08),
        attn_wv: mat_rand(t, N_EMBD, N_EMBD, 0.08),
        attn_wo: mat_rand(t, N_EMBD, N_EMBD, 0.08),
        mlp_fc1: mat_rand(t, 4 * N_EMBD, N_EMBD, 0.08),
        mlp_fc2: mat_rand(t, N_EMBD, 4 * N_EMBD, 0.08),
    };

// Record parameter count for Adam optimizer
    let num_params = t.data.len();
    println(f"num params: {num_params}");

// Adam optimizer buffers (plain f64 arrays, not on tape)
    var adam_m: Vec<f64> = Vec::new();
    var adam_v: Vec<f64> = Vec::new();
    for pi in 0..num_params { adam_m.push(0.0); adam_v.push(0.0); }

let learning_rate = 0.01;
    let beta1 = 0.85;
    let beta2 = 0.99;
    let eps_adam = 1.0e-8;

// --- Training loop ---
    // 200 steps for decent training. Use 1000 for full training (matches Python).
    println(f"Training for {NUM_STEPS} steps...");

for step in 0..NUM_STEPS {
        // Pick document
        let doc_idx = doc_order[step % num_docs];
        let ds = doc_starts[doc_idx];
        let dl = doc_lens[doc_idx];

// Tokenize: [BOS] + chars + [BOS]
        let tokens = tokenize_doc(raw, ds, dl, char_to_tok, bos);

var n = tokens.len() - 1;
        if n > BLOCK_SIZE { n = BLOCK_SIZE; }

// Fresh tape per step: copy parameter values, discard prior computation graph
        let step_tape = tape_new();
        // Copy parameter values into new tape
        for cp in 0..num_params {
            step_tape.val(t.vd(cp));
        }

// KV cache: keys_flat[pos * N_EMBD + d] = tape index for key vector
        let keys_flat = init_kv_cache(BLOCK_SIZE * N_EMBD);
        let vals_flat = init_kv_cache(BLOCK_SIZE * N_EMBD);

// Forward pass: process each position
        let loss_terms: Vec<Val> = Vec::new();
        for pos_id in 0..n {
            let token_id = tokens[pos_id];
            let target_id = tokens[pos_id + 1];

let logits = gpt_forward(step_tape, model, token_id, pos_id, keys_flat, vals_flat, vocab_size);

// Loss: -log(softmax(logits)[target])
            let probs = softmax(step_tape, logits);
            let loss_t = step_tape.vneg(step_tape.vlog(probs[target_id]));
            loss_terms.push(loss_t);
        }

// Average loss
        let loss_sum = step_tape.vsum(loss_terms);
        let fn_n = n.to_f64();
        let loss = step_tape.vmul_const(loss_sum, 1.0 / fn_n);

// Backward
        backward(step_tape, loss);

// Adam update
        let step_f = (step + 1).to_f64();
        let lr_t = learning_rate * (1.0 - step_f / NUM_STEPS.to_f64());
        for pi in 0..num_params {
            let g = step_tape.vg(pi);
            adam_m[pi] = beta1 * adam_m[pi] + (1.0 - beta1) * g;
            adam_v[pi] = beta2 * adam_v[pi] + (1.0 - beta2) * g * g;
            let m_hat = adam_m[pi] / (1.0 - math.pow(beta1, step_f));
            let v_hat = adam_v[pi] / (1.0 - math.pow(beta2, step_f));
            let new_data = t.vd(pi) - lr_t * m_hat / (math.sqrt(v_hat) + eps_adam);
            vsd(t, pi, new_data);
        }

let loss_val = step_tape.vd(loss);
        println(f"step {step + 1} / {NUM_STEPS} | loss {loss_val}");
    }

// --- Inference ---
    let temperature = 0.5;
    println("--- inference (new, hallucinated names) ---");

for sample_idx in 0..20 {
        // Fresh tape for inference (copy params)
        let inf_tape = tape_new();
        for ip in 0..num_params { inf_tape.val(t.vd(ip)); }

// Fresh KV cache
        let inf_keys = init_kv_cache(BLOCK_SIZE * N_EMBD);
        let inf_vals = init_kv_cache(BLOCK_SIZE * N_EMBD);

var token_id = bos;
        var sample = "";
        var gen_pos = 0;
        while gen_pos < BLOCK_SIZE {
            let logits = gpt_forward(inf_tape, model, token_id, gen_pos, inf_keys, inf_vals, vocab_size);

// Apply temperature
            let scaled: Vec<Val> = Vec::new();
            for d in 0..vocab_size {
                scaled.push(inf_tape.vmul_const(logits[d], 1.0 / temperature));
            }
            let probs = softmax(inf_tape, scaled);

// Sample next token
            let cum_weights: Vec<f64> = Vec::new();
            var cum_total = 0.0;
            for d in 0..vocab_size {
                cum_total += inf_tape.vd(probs[d]);
                cum_weights.push(cum_total);
            }

token_id = random.choices(cum_weights, cum_total, vocab_size);

if token_id == bos {
                // End of sequence
                break;
            } else {
                // Append character from vocab
                let ch_str = uchars[token_id];
                sample = sample + ch_str;
                gen_pos += 1;
            }
        }

println(f"sample {sample_idx + 1}: {sample}");
    }

0
}