MicroGPT in Hew v0.3

public hew v1 · immutable
#3ZBHcfa32y
·published 2026-05-30 16:15 UTC ·by slepp
// microgpt.hew — MicroGPT in pure Hew (v0.3 dialect)// Port of Karpathy's microgpt.py — a complete GPT language model.// Trains on a list of names, then generates new ones.//// Architecture:// - Tape-based autograd with reverse-mode differentiation// - GPT-2 style: token/position embeddings, multi-head attention, RMSNorm, MLP// - Adam optimizer with bias correction and linear LR decay// - Uses math.* (LLVM intrinsics) and random.* (MT19937) from Hew stdlib// ============================================================// Hyperparameters// ============================================================const N_EMBD: Int = 16;const BLOCK_SIZE: Int = 16;const N_HEAD: Int = 4;const HEAD_DIM: Int = 4;    // N_EMBD / N_HEADconst NUM_STEPS: Int = 200;// ============================================================// SECTION 1: Tape-based Autograd// ============================================================// Every "Value" is an Int index into parallel arrays.// Operations append to the tape and record the DAG for backward().// A Val is a tape index — an Int that refers to a node in the computation graph.type Val = Int;// Tape storage — passed as a struct to avoid globalstype Tape {    data: Vec<f64>;      // forward values    grad: Vec<f64>;      // gradient accumulators    left: Vec<Int>;      // left child index (-1 = none)    right: Vec<Int>;     // right child index (-1 = none)    lgrad: Vec<f64>;     // local gradient for left child    rgrad: Vec<f64>;     // local gradient for right child}fn tape_new() -> Tape {    let d: Vec<f64> = Vec::new();    let g: Vec<f64> = Vec::new();    let l: Vec<Int> = Vec::new();    let r: Vec<Int> = Vec::new();    let lg: Vec<f64> = Vec::new();    let rg: Vec<f64> = Vec::new();    Tape {        data: d,        grad: g,        left: l,        right: r,        lgrad: lg,        rgrad: rg    }}impl Tape {// Create a new leaf value on the tape. Returns its index.fn val(t: Tape, d: f64) -> Val {    let idx = t.data.len();    t.data.push(d);    t.grad.push(0.0);    t.left.push(-1);    t.right.push(-1);    t.lgrad.push(0.0);    t.rgrad.push(0.0);    idx}// Get data of a tape valuefn vd(t: Tape, i: Val) -> f64 { t.data[i] }// Get grad of a tape valuefn vg(t: Tape, i: Val) -> f64 { t.grad[i] }// a + bfn vadd(t: Tape, a: Val, b: Val) -> Val {    let idx = t.data.len();    t.data.push(t.vd(a) + t.vd(b));    t.grad.push(0.0);    t.left.push(a);    t.right.push(b);    t.lgrad.push(1.0);    t.rgrad.push(1.0);    idx}// a * bfn vmul(t: Tape, a: Val, b: Val) -> Val {    let idx = t.data.len();    t.data.push(t.vd(a) * t.vd(b));    t.grad.push(0.0);    t.left.push(a);    t.right.push(b);    t.lgrad.push(t.vd(b));  // d/da(a*b) = b    t.rgrad.push(t.vd(a));  // d/db(a*b) = a    idx}// a ** c (c is a float constant, not a tape value)fn vpow(t: Tape, a: Val, c: f64) -> Val {    let ad = t.vd(a);    let idx = t.data.len();    t.data.push(math.pow(ad, c));    t.grad.push(0.0);    t.left.push(a);    t.right.push(-1);    t.lgrad.push(c * math.pow(ad, c - 1.0));  // d/da(a^c) = c*a^(c-1)    t.rgrad.push(0.0);    idx}// log(a)fn vlog(t: Tape, a: Val) -> Val {    let ad = t.vd(a);    let idx = t.data.len();    t.data.push(math.log(ad));    t.grad.push(0.0);    t.left.push(a);    t.right.push(-1);    t.lgrad.push(1.0 / ad);  // d/da(ln(a)) = 1/a    t.rgrad.push(0.0);    idx}// exp(a)fn vexp(t: Tape, a: Val) -> Val {    let ad = t.vd(a);    let ev = math.exp(ad);    let idx = t.data.len();    t.data.push(ev);    t.grad.push(0.0);    t.left.push(a);    t.right.push(-1);    t.lgrad.push(ev);  // d/da(e^a) = e^a    t.rgrad.push(0.0);    idx}// relu(a)fn vrelu(t: Tape, a: Val) -> Val {    let ad = t.vd(a);    let idx = t.data.len();    t.data.push(math.max(0.0, ad));    t.grad.push(0.0);    t.left.push(a);    t.right.push(-1);    if ad > 0.0 { t.lgrad.push(1.0); } else { t.lgrad.push(0.0); }    t.rgrad.push(0.0);    idx}// -a (negate)fn vneg(t: Tape, a: Val) -> Val {    let neg1 = t.val(-1.0);    t.vmul(a, neg1)}// a - bfn vsub(t: Tape, a: Val, b: Val) -> Val {    t.vadd(a, t.vneg(b))}// a / b = a * b^(-1)fn vdiv(t: Tape, a: Val, b: Val) -> Val {    t.vmul(a, t.vpow(b, -1.0))}// Multiply tape value by a float constant: a * cfn vmul_const(t: Tape, a: Val, c: f64) -> Val {    let cv = t.val(c);    t.vmul(a, cv)}// Add float constant: a + cfn vadd_const(t: Tape, a: Val, c: f64) -> Val {    let cv = t.val(c);    t.vadd(a, cv)}// Sum a list of tape valuesfn vsum(t: Tape, indices: Vec<Val>) -> Val {    let n = indices.len();    if n == 0 { return t.val(0.0); }    var acc = indices[0];    var i = 1;    while i < n { acc = t.vadd(acc, indices[i]); i += 1; }    acc}} // impl Tape// Set data of a tape value (standalone: impl method receivers cannot be var)fn vsd(var t: Tape, i: Val, d: f64) { t.data[i] = d; }// Backward pass: compute gradients for all tape valuesfn backward(var t: Tape, loss_idx: Val) {    // Topological sort is implicit: tape indices are already in topo order    // (every value depends only on values with smaller indices)    t.grad[loss_idx] = 1.0;    // Walk backward through tape    var i = loss_idx;    while i >= 0 {        let g = t.grad[i];        let li = t.left[i];        let ri = t.right[i];        if li >= 0 {            t.grad[li] += t.lgrad[i] * g;        }        if ri >= 0 {            t.grad[ri] += t.rgrad[i] * g;        }        i -= 1;    }}// ============================================================// SECTION 2: Matrix operations on tape values// ============================================================// A "matrix" is a flat Vec<Val> of tape indices, with rows*cols layout.// We store dimensions separately.// Create a matrix of random values on the tapefn mat_rand(t: Tape, rows: Int, cols: Int, std: f64) -> Vec<Val> {    let m: Vec<Val> = Vec::new();    for r in 0..rows {        for c in 0..cols {            let v = random.gauss(0.0, std);            m.push(t.val(v));        }    }    m}// Linear transform: y = W @ x, where W is [nout x nin], x is Vec<Val> of length nin// Returns Vec<Val> of length noutfn linear(t: Tape, w: Vec<Val>, nout: Int, x: Vec<Val>) -> Vec<Val> {    let nin = x.len();    let result: Vec<Val> = Vec::new();    for r in 0..nout {        // Dot product of row r of W with x        let temps: Vec<Val> = Vec::new();        for c in 0..nin {            let wi = w[r * nin + c];            let xi = x[c];            temps.push(t.vmul(wi, xi));        }        result.push(t.vsum(temps));    }    result}// Softmax on Vec<Val> of tape values. Returns Vec<Val> (new tape values = probs).fn softmax(t: Tape, logits: Vec<Val>) -> Vec<Val> {    let n = logits.len();    // Find max for numerical stability    var max_val = t.vd(logits[0]);    var i = 1;    while i < n {        let v = t.vd(logits[i]);        if v > max_val { max_val = v; }        i += 1;    }    // exp(logit - max)    let exps: Vec<Val> = Vec::new();    for i in 0..n {        let shifted = t.vsub(logits[i], t.val(max_val));        exps.push(t.vexp(shifted));    }    // sum of exps    let total = t.vsum(exps);    // divide each by total    let probs: Vec<Val> = Vec::new();    for i in 0..n { probs.push(t.vdiv(exps[i], total)); }    probs}// RMSNorm on Vec<Val>. Returns Vec<Val> of same length.fn rmsnorm(t: Tape, x: Vec<Val>) -> Vec<Val> {    let n = x.len();    // ms = sum(xi^2) / n    let sq_terms: Vec<Val> = Vec::new();    for i in 0..n { sq_terms.push(t.vmul(x[i], x[i])); }    let ms_sum = t.vsum(sq_terms);    let fn_val = n.to_f64();    let ms = t.vmul_const(ms_sum, 1.0 / fn_val);    // scale = (ms + 1e-5)^(-0.5)    let scale = t.vpow(t.vadd_const(ms, 1.0e-5), -0.5);    // xi * scale    let result: Vec<Val> = Vec::new();    for i in 0..n { result.push(t.vmul(x[i], scale)); }    result}// ============================================================// SECTION 3: Helpers, Model struct, GPT forward// ============================================================// Copy a Vec<Val> of tape indices (shallow copy of index values)fn vec_copy(src: Vec<Val>) -> Vec<Val> {    let dst: Vec<Val> = Vec::new();    for i in 0..src.len() { dst.push(src[i]); }    dst}// Initialize a KV cache of given size filled with zerosfn init_kv_cache(size: Int) -> Vec<Val> {    let cache: Vec<Val> = Vec::new();    for i in 0..size { cache.push(0); }    cache}// Tokenize a document: [BOS] + char tokens + [BOS]fn tokenize_doc(raw: String, start: Int, len: Int, char_to_tok: HashMap<String, Int>, bos: Int) -> Vec<Int> {    let tokens: Vec<Int> = Vec::new();    tokens.push(bos);    for i in 0..len {        let ch = raw.slice(start + i, start + i + 1);        match char_to_tok.get(ch) {            Some(tok) => tokens.push(tok),            None => {},        }    }    tokens.push(bos);    tokens}// All weight matrices for the 1-layer GPTtype Model {    wte: Vec<Val>;    wpe: Vec<Val>;    lm_head: Vec<Val>;    attn_wq: Vec<Val>;    attn_wk: Vec<Val>;    attn_wv: Vec<Val>;    attn_wo: Vec<Val>;    mlp_fc1: Vec<Val>;    mlp_fc2: Vec<Val>;}// Run one GPT forward step. Returns logits as Vec<Val>.fn gpt_forward(    tape: Tape, model: Model,    token_id: Int, pos_id: Int,    var keys: Vec<Val>, var vals: Vec<Val>,    vocab_size: Int) -> Vec<Val> {    // Token + position embedding    let x: Vec<Val> = Vec::new();    for d in 0..N_EMBD {        let tok_emb = model.wte[token_id * N_EMBD + d];        let pos_emb = model.wpe[pos_id * N_EMBD + d];        x.push(tape.vadd(tok_emb, pos_emb));    }    // RMSNorm    let x_normed = rmsnorm(tape, x);    // --- Attention block ---    let x_residual = vec_copy(x_normed);    let x_pre = rmsnorm(tape, x_normed);    let q = linear(tape, model.attn_wq, N_EMBD, x_pre);    let k = linear(tape, model.attn_wk, N_EMBD, x_pre);    let v = linear(tape, model.attn_wv, N_EMBD, x_pre);    // Store K, V in cache    for d in 0..N_EMBD {        keys[pos_id * N_EMBD + d] = k[d];        vals[pos_id * N_EMBD + d] = v[d];    }    let nc = pos_id + 1;    // Multi-head attention    let x_attn: Vec<Val> = Vec::new();    for h in 0..N_HEAD {        let hs = h * HEAD_DIM;        let attn_logits: Vec<Val> = Vec::new();        for tt in 0..nc {            let dot_terms: Vec<Val> = Vec::new();            for jj in 0..HEAD_DIM {                dot_terms.push(tape.vmul(q[hs + jj], keys[tt * N_EMBD + hs + jj]));            }            let dot = tape.vsum(dot_terms);            attn_logits.push(tape.vmul_const(dot, 1.0 / math.sqrt(HEAD_DIM.to_f64())));        }        let attn_weights = softmax(tape, attn_logits);        for jj in 0..HEAD_DIM {            let weighted_terms: Vec<Val> = Vec::new();            for tt in 0..nc {                weighted_terms.push(tape.vmul(attn_weights[tt], vals[tt * N_EMBD + hs + jj]));            }            x_attn.push(tape.vsum(weighted_terms));        }    }    // Output projection + residual    let x_proj = linear(tape, model.attn_wo, N_EMBD, x_attn);    let x_after_attn: Vec<Val> = Vec::new();    for d in 0..N_EMBD {        x_after_attn.push(tape.vadd(x_proj[d], x_residual[d]));    }    // --- MLP block ---    let x_res2 = vec_copy(x_after_attn);    let x_mlp_in = rmsnorm(tape, x_after_attn);    let x_fc1 = linear(tape, model.mlp_fc1, 4 * N_EMBD, x_mlp_in);    let x_relu: Vec<Val> = Vec::new();    for d in 0..(4 * N_EMBD) { x_relu.push(tape.vrelu(x_fc1[d])); }    let x_fc2 = linear(tape, model.mlp_fc2, N_EMBD, x_relu);    let x_final: Vec<Val> = Vec::new();    for d in 0..N_EMBD {        x_final.push(tape.vadd(x_fc2[d], x_res2[d]));    }    // Logits    linear(tape, model.lm_head, vocab_size, x_final)}// ============================================================// SECTION 4: Main — Dataset, Training, Inference// ============================================================fn main() -> Int {    // --- Load dataset ---    println("Loading dataset...");    let raw = read_file("input.txt");    let raw_len = raw.len();    println(f"File length: {raw_len}");    // Parse raw text into lines (one document per line)    var doc_starts: Vec<Int> = Vec::new();    let doc_lens: Vec<Int> = Vec::new();    doc_starts.push(0);    var pos = 0;    while pos < raw_len {        if string_char_at(raw, pos) == '\n' {            let start = doc_starts[doc_starts.len() - 1];            let doc_len = pos - start;            if doc_len > 0 {                doc_lens.push(doc_len);                doc_starts.push(pos + 1);            } else {                doc_starts[doc_starts.len() - 1] = pos + 1;            }        }        pos += 1;    }    let last_start = doc_starts[doc_starts.len() - 1];    if last_start < raw_len {        doc_lens.push(raw_len - last_start);    }    let num_docs = doc_lens.len();    println(f"num docs: {num_docs}");    // Build sorted vocabulary of unique characters    var uchars: Vec<String> = Vec::new();   // unique chars as 1-char strings    for di in 0..num_docs {        let ds = doc_starts[di];        let dl = doc_lens[di];        for cj in 0..dl {            let ch = raw.slice(ds + cj, ds + cj + 1);            // Check if ch is already in uchars            var found = false;            var uk = 0;            while uk < uchars.len() {                if uchars[uk] == ch {                    found = true;                    break;                }                uk += 1;            }            if !found {                uchars.push(ch);            }        }    }    // Sort uchars by char code (insertion sort)    var si2 = 1;    while si2 < uchars.len() {        let key = uchars[si2].clone();        let key_code = string_char_at(key, 0);        var j2 = si2 - 1;        while j2 >= 0 {            let a_code = string_char_at(uchars[j2], 0);            if a_code > key_code {                uchars[j2 + 1] = uchars[j2].clone();                j2 -= 1;            } else {                break;            }        }        uchars[j2 + 1] = key;        si2 += 1;    }    // Build char-to-token lookup    let char_to_tok: HashMap<String, Int> = HashMap::new();    for i in 0..uchars.len() {        char_to_tok.insert(uchars[i], i);    }    let bos = uchars.len();  // BOS token id    let vocab_size = uchars.len() + 1;    println(f"vocab size: {vocab_size}");    // --- Shuffle document indices ---    let doc_order: Vec<Int> = Vec::new();    for di in 0..num_docs { doc_order.push(di); }    random.seed(42);    random.shuffle(doc_order);    // --- Initialize model parameters ---    println("Initializing parameters...");    let t = tape_new();    let model = Model {        wte: mat_rand(t, vocab_size, N_EMBD, 0.08),        wpe: mat_rand(t, BLOCK_SIZE, N_EMBD, 0.08),        lm_head: mat_rand(t, vocab_size, N_EMBD, 0.08),        attn_wq: mat_rand(t, N_EMBD, N_EMBD, 0.08),        attn_wk: mat_rand(t, N_EMBD, N_EMBD, 0.08),        attn_wv: mat_rand(t, N_EMBD, N_EMBD, 0.08),        attn_wo: mat_rand(t, N_EMBD, N_EMBD, 0.08),        mlp_fc1: mat_rand(t, 4 * N_EMBD, N_EMBD, 0.08),        mlp_fc2: mat_rand(t, N_EMBD, 4 * N_EMBD, 0.08),    };    // Record parameter count for Adam optimizer    let num_params = t.data.len();    println(f"num params: {num_params}");    // Adam optimizer buffers (plain f64 arrays, not on tape)    var adam_m: Vec<f64> = Vec::new();    var adam_v: Vec<f64> = Vec::new();    for pi in 0..num_params { adam_m.push(0.0); adam_v.push(0.0); }    let learning_rate = 0.01;    let beta1 = 0.85;    let beta2 = 0.99;    let eps_adam = 1.0e-8;    // --- Training loop ---    // 200 steps for decent training. Use 1000 for full training (matches Python).    println(f"Training for {NUM_STEPS} steps...");    for step in 0..NUM_STEPS {        // Pick document        let doc_idx = doc_order[step % num_docs];        let ds = doc_starts[doc_idx];        let dl = doc_lens[doc_idx];        // Tokenize: [BOS] + chars + [BOS]        let tokens = tokenize_doc(raw, ds, dl, char_to_tok, bos);        var n = tokens.len() - 1;        if n > BLOCK_SIZE { n = BLOCK_SIZE; }        // Fresh tape per step: copy parameter values, discard prior computation graph        let step_tape = tape_new();        // Copy parameter values into new tape        for cp in 0..num_params {            step_tape.val(t.vd(cp));        }        // KV cache: keys_flat[pos * N_EMBD + d] = tape index for key vector        let keys_flat = init_kv_cache(BLOCK_SIZE * N_EMBD);        let vals_flat = init_kv_cache(BLOCK_SIZE * N_EMBD);        // Forward pass: process each position        let loss_terms: Vec<Val> = Vec::new();        for pos_id in 0..n {            let token_id = tokens[pos_id];            let target_id = tokens[pos_id + 1];            let logits = gpt_forward(step_tape, model, token_id, pos_id, keys_flat, vals_flat, vocab_size);            // Loss: -log(softmax(logits)[target])            let probs = softmax(step_tape, logits);            let loss_t = step_tape.vneg(step_tape.vlog(probs[target_id]));            loss_terms.push(loss_t);        }        // Average loss        let loss_sum = step_tape.vsum(loss_terms);        let fn_n = n.to_f64();        let loss = step_tape.vmul_const(loss_sum, 1.0 / fn_n);        // Backward        backward(step_tape, loss);        // Adam update        let step_f = (step + 1).to_f64();        let lr_t = learning_rate * (1.0 - step_f / NUM_STEPS.to_f64());        for pi in 0..num_params {            let g = step_tape.vg(pi);            adam_m[pi] = beta1 * adam_m[pi] + (1.0 - beta1) * g;            adam_v[pi] = beta2 * adam_v[pi] + (1.0 - beta2) * g * g;            let m_hat = adam_m[pi] / (1.0 - math.pow(beta1, step_f));            let v_hat = adam_v[pi] / (1.0 - math.pow(beta2, step_f));            let new_data = t.vd(pi) - lr_t * m_hat / (math.sqrt(v_hat) + eps_adam);            vsd(t, pi, new_data);        }        let loss_val = step_tape.vd(loss);        println(f"step {step + 1} / {NUM_STEPS} | loss {loss_val}");    }    // --- Inference ---    let temperature = 0.5;    println("--- inference (new, hallucinated names) ---");    for sample_idx in 0..20 {        // Fresh tape for inference (copy params)        let inf_tape = tape_new();        for ip in 0..num_params { inf_tape.val(t.vd(ip)); }        // Fresh KV cache        let inf_keys = init_kv_cache(BLOCK_SIZE * N_EMBD);        let inf_vals = init_kv_cache(BLOCK_SIZE * N_EMBD);        var token_id = bos;        var sample = "";        var gen_pos = 0;        while gen_pos < BLOCK_SIZE {            let logits = gpt_forward(inf_tape, model, token_id, gen_pos, inf_keys, inf_vals, vocab_size);            // Apply temperature            let scaled: Vec<Val> = Vec::new();            for d in 0..vocab_size {                scaled.push(inf_tape.vmul_const(logits[d], 1.0 / temperature));            }            let probs = softmax(inf_tape, scaled);            // Sample next token            let cum_weights: Vec<f64> = Vec::new();            var cum_total = 0.0;            for d in 0..vocab_size {                cum_total += inf_tape.vd(probs[d]);                cum_weights.push(cum_total);            }            token_id = random.choices(cum_weights, cum_total, vocab_size);            if token_id == bos {                // End of sequence                break;            } else {                // Append character from vocab                let ch_str = uchars[token_id];                sample = sample + ch_str;                gen_pos += 1;            }        }        println(f"sample {sample_idx + 1}: {sample}");    }    0}