// microgpt.hew — MicroGPT in pure Hew (v0.3 dialect)// Port of Karpathy's microgpt.py — a complete GPT language model.// Trains on a list of names, then generates new ones.//// Architecture:// - Tape-based autograd with reverse-mode differentiation// - GPT-2 style: token/position embeddings, multi-head attention, RMSNorm, MLP// - Adam optimizer with bias correction and linear LR decay// - Uses math.* (LLVM intrinsics) and random.* (MT19937) from Hew stdlib// ============================================================// Hyperparameters// ============================================================const N_EMBD: Int = 16;const BLOCK_SIZE: Int = 16;const N_HEAD: Int = 4;const HEAD_DIM: Int = 4; // N_EMBD / N_HEADconst NUM_STEPS: Int = 200;// ============================================================// SECTION 1: Tape-based Autograd// ============================================================// Every "Value" is an Int index into parallel arrays.// Operations append to the tape and record the DAG for backward().// A Val is a tape index — an Int that refers to a node in the computation graph.type Val = Int;// Tape storage — passed as a struct to avoid globalstype Tape { data: Vec<f64>; // forward values grad: Vec<f64>; // gradient accumulators left: Vec<Int>; // left child index (-1 = none) right: Vec<Int>; // right child index (-1 = none) lgrad: Vec<f64>; // local gradient for left child rgrad: Vec<f64>; // local gradient for right child}fn tape_new() -> Tape { let d: Vec<f64> = Vec::new(); let g: Vec<f64> = Vec::new(); let l: Vec<Int> = Vec::new(); let r: Vec<Int> = Vec::new(); let lg: Vec<f64> = Vec::new(); let rg: Vec<f64> = Vec::new(); Tape { data: d, grad: g, left: l, right: r, lgrad: lg, rgrad: rg }}impl Tape {// Create a new leaf value on the tape. Returns its index.fn val(t: Tape, d: f64) -> Val { let idx = t.data.len(); t.data.push(d); t.grad.push(0.0); t.left.push(-1); t.right.push(-1); t.lgrad.push(0.0); t.rgrad.push(0.0); idx}// Get data of a tape valuefn vd(t: Tape, i: Val) -> f64 { t.data[i] }// Get grad of a tape valuefn vg(t: Tape, i: Val) -> f64 { t.grad[i] }// a + bfn vadd(t: Tape, a: Val, b: Val) -> Val { let idx = t.data.len(); t.data.push(t.vd(a) + t.vd(b)); t.grad.push(0.0); t.left.push(a); t.right.push(b); t.lgrad.push(1.0); t.rgrad.push(1.0); idx}// a * bfn vmul(t: Tape, a: Val, b: Val) -> Val { let idx = t.data.len(); t.data.push(t.vd(a) * t.vd(b)); t.grad.push(0.0); t.left.push(a); t.right.push(b); t.lgrad.push(t.vd(b)); // d/da(a*b) = b t.rgrad.push(t.vd(a)); // d/db(a*b) = a idx}// a ** c (c is a float constant, not a tape value)fn vpow(t: Tape, a: Val, c: f64) -> Val { let ad = t.vd(a); let idx = t.data.len(); t.data.push(math.pow(ad, c)); t.grad.push(0.0); t.left.push(a); t.right.push(-1); t.lgrad.push(c * math.pow(ad, c - 1.0)); // d/da(a^c) = c*a^(c-1) t.rgrad.push(0.0); idx}// log(a)fn vlog(t: Tape, a: Val) -> Val { let ad = t.vd(a); let idx = t.data.len(); t.data.push(math.log(ad)); t.grad.push(0.0); t.left.push(a); t.right.push(-1); t.lgrad.push(1.0 / ad); // d/da(ln(a)) = 1/a t.rgrad.push(0.0); idx}// exp(a)fn vexp(t: Tape, a: Val) -> Val { let ad = t.vd(a); let ev = math.exp(ad); let idx = t.data.len(); t.data.push(ev); t.grad.push(0.0); t.left.push(a); t.right.push(-1); t.lgrad.push(ev); // d/da(e^a) = e^a t.rgrad.push(0.0); idx}// relu(a)fn vrelu(t: Tape, a: Val) -> Val { let ad = t.vd(a); let idx = t.data.len(); t.data.push(math.max(0.0, ad)); t.grad.push(0.0); t.left.push(a); t.right.push(-1); if ad > 0.0 { t.lgrad.push(1.0); } else { t.lgrad.push(0.0); } t.rgrad.push(0.0); idx}// -a (negate)fn vneg(t: Tape, a: Val) -> Val { let neg1 = t.val(-1.0); t.vmul(a, neg1)}// a - bfn vsub(t: Tape, a: Val, b: Val) -> Val { t.vadd(a, t.vneg(b))}// a / b = a * b^(-1)fn vdiv(t: Tape, a: Val, b: Val) -> Val { t.vmul(a, t.vpow(b, -1.0))}// Multiply tape value by a float constant: a * cfn vmul_const(t: Tape, a: Val, c: f64) -> Val { let cv = t.val(c); t.vmul(a, cv)}// Add float constant: a + cfn vadd_const(t: Tape, a: Val, c: f64) -> Val { let cv = t.val(c); t.vadd(a, cv)}// Sum a list of tape valuesfn vsum(t: Tape, indices: Vec<Val>) -> Val { let n = indices.len(); if n == 0 { return t.val(0.0); } var acc = indices[0]; var i = 1; while i < n { acc = t.vadd(acc, indices[i]); i += 1; } acc}} // impl Tape// Set data of a tape value (standalone: impl method receivers cannot be var)fn vsd(var t: Tape, i: Val, d: f64) { t.data[i] = d; }// Backward pass: compute gradients for all tape valuesfn backward(var t: Tape, loss_idx: Val) { // Topological sort is implicit: tape indices are already in topo order // (every value depends only on values with smaller indices) t.grad[loss_idx] = 1.0; // Walk backward through tape var i = loss_idx; while i >= 0 { let g = t.grad[i]; let li = t.left[i]; let ri = t.right[i]; if li >= 0 { t.grad[li] += t.lgrad[i] * g; } if ri >= 0 { t.grad[ri] += t.rgrad[i] * g; } i -= 1; }}// ============================================================// SECTION 2: Matrix operations on tape values// ============================================================// A "matrix" is a flat Vec<Val> of tape indices, with rows*cols layout.// We store dimensions separately.// Create a matrix of random values on the tapefn mat_rand(t: Tape, rows: Int, cols: Int, std: f64) -> Vec<Val> { let m: Vec<Val> = Vec::new(); for r in 0..rows { for c in 0..cols { let v = random.gauss(0.0, std); m.push(t.val(v)); } } m}// Linear transform: y = W @ x, where W is [nout x nin], x is Vec<Val> of length nin// Returns Vec<Val> of length noutfn linear(t: Tape, w: Vec<Val>, nout: Int, x: Vec<Val>) -> Vec<Val> { let nin = x.len(); let result: Vec<Val> = Vec::new(); for r in 0..nout { // Dot product of row r of W with x let temps: Vec<Val> = Vec::new(); for c in 0..nin { let wi = w[r * nin + c]; let xi = x[c]; temps.push(t.vmul(wi, xi)); } result.push(t.vsum(temps)); } result}// Softmax on Vec<Val> of tape values. Returns Vec<Val> (new tape values = probs).fn softmax(t: Tape, logits: Vec<Val>) -> Vec<Val> { let n = logits.len(); // Find max for numerical stability var max_val = t.vd(logits[0]); var i = 1; while i < n { let v = t.vd(logits[i]); if v > max_val { max_val = v; } i += 1; } // exp(logit - max) let exps: Vec<Val> = Vec::new(); for i in 0..n { let shifted = t.vsub(logits[i], t.val(max_val)); exps.push(t.vexp(shifted)); } // sum of exps let total = t.vsum(exps); // divide each by total let probs: Vec<Val> = Vec::new(); for i in 0..n { probs.push(t.vdiv(exps[i], total)); } probs}// RMSNorm on Vec<Val>. Returns Vec<Val> of same length.fn rmsnorm(t: Tape, x: Vec<Val>) -> Vec<Val> { let n = x.len(); // ms = sum(xi^2) / n let sq_terms: Vec<Val> = Vec::new(); for i in 0..n { sq_terms.push(t.vmul(x[i], x[i])); } let ms_sum = t.vsum(sq_terms); let fn_val = n.to_f64(); let ms = t.vmul_const(ms_sum, 1.0 / fn_val); // scale = (ms + 1e-5)^(-0.5) let scale = t.vpow(t.vadd_const(ms, 1.0e-5), -0.5); // xi * scale let result: Vec<Val> = Vec::new(); for i in 0..n { result.push(t.vmul(x[i], scale)); } result}// ============================================================// SECTION 3: Helpers, Model struct, GPT forward// ============================================================// Copy a Vec<Val> of tape indices (shallow copy of index values)fn vec_copy(src: Vec<Val>) -> Vec<Val> { let dst: Vec<Val> = Vec::new(); for i in 0..src.len() { dst.push(src[i]); } dst}// Initialize a KV cache of given size filled with zerosfn init_kv_cache(size: Int) -> Vec<Val> { let cache: Vec<Val> = Vec::new(); for i in 0..size { cache.push(0); } cache}// Tokenize a document: [BOS] + char tokens + [BOS]fn tokenize_doc(raw: String, start: Int, len: Int, char_to_tok: HashMap<String, Int>, bos: Int) -> Vec<Int> { let tokens: Vec<Int> = Vec::new(); tokens.push(bos); for i in 0..len { let ch = raw.slice(start + i, start + i + 1); match char_to_tok.get(ch) { Some(tok) => tokens.push(tok), None => {}, } } tokens.push(bos); tokens}// All weight matrices for the 1-layer GPTtype Model { wte: Vec<Val>; wpe: Vec<Val>; lm_head: Vec<Val>; attn_wq: Vec<Val>; attn_wk: Vec<Val>; attn_wv: Vec<Val>; attn_wo: Vec<Val>; mlp_fc1: Vec<Val>; mlp_fc2: Vec<Val>;}// Run one GPT forward step. Returns logits as Vec<Val>.fn gpt_forward( tape: Tape, model: Model, token_id: Int, pos_id: Int, var keys: Vec<Val>, var vals: Vec<Val>, vocab_size: Int) -> Vec<Val> { // Token + position embedding let x: Vec<Val> = Vec::new(); for d in 0..N_EMBD { let tok_emb = model.wte[token_id * N_EMBD + d]; let pos_emb = model.wpe[pos_id * N_EMBD + d]; x.push(tape.vadd(tok_emb, pos_emb)); } // RMSNorm let x_normed = rmsnorm(tape, x); // --- Attention block --- let x_residual = vec_copy(x_normed); let x_pre = rmsnorm(tape, x_normed); let q = linear(tape, model.attn_wq, N_EMBD, x_pre); let k = linear(tape, model.attn_wk, N_EMBD, x_pre); let v = linear(tape, model.attn_wv, N_EMBD, x_pre); // Store K, V in cache for d in 0..N_EMBD { keys[pos_id * N_EMBD + d] = k[d]; vals[pos_id * N_EMBD + d] = v[d]; } let nc = pos_id + 1; // Multi-head attention let x_attn: Vec<Val> = Vec::new(); for h in 0..N_HEAD { let hs = h * HEAD_DIM; let attn_logits: Vec<Val> = Vec::new(); for tt in 0..nc { let dot_terms: Vec<Val> = Vec::new(); for jj in 0..HEAD_DIM { dot_terms.push(tape.vmul(q[hs + jj], keys[tt * N_EMBD + hs + jj])); } let dot = tape.vsum(dot_terms); attn_logits.push(tape.vmul_const(dot, 1.0 / math.sqrt(HEAD_DIM.to_f64()))); } let attn_weights = softmax(tape, attn_logits); for jj in 0..HEAD_DIM { let weighted_terms: Vec<Val> = Vec::new(); for tt in 0..nc { weighted_terms.push(tape.vmul(attn_weights[tt], vals[tt * N_EMBD + hs + jj])); } x_attn.push(tape.vsum(weighted_terms)); } } // Output projection + residual let x_proj = linear(tape, model.attn_wo, N_EMBD, x_attn); let x_after_attn: Vec<Val> = Vec::new(); for d in 0..N_EMBD { x_after_attn.push(tape.vadd(x_proj[d], x_residual[d])); } // --- MLP block --- let x_res2 = vec_copy(x_after_attn); let x_mlp_in = rmsnorm(tape, x_after_attn); let x_fc1 = linear(tape, model.mlp_fc1, 4 * N_EMBD, x_mlp_in); let x_relu: Vec<Val> = Vec::new(); for d in 0..(4 * N_EMBD) { x_relu.push(tape.vrelu(x_fc1[d])); } let x_fc2 = linear(tape, model.mlp_fc2, N_EMBD, x_relu); let x_final: Vec<Val> = Vec::new(); for d in 0..N_EMBD { x_final.push(tape.vadd(x_fc2[d], x_res2[d])); } // Logits linear(tape, model.lm_head, vocab_size, x_final)}// ============================================================// SECTION 4: Main — Dataset, Training, Inference// ============================================================fn main() -> Int { // --- Load dataset --- println("Loading dataset..."); let raw = read_file("input.txt"); let raw_len = raw.len(); println(f"File length: {raw_len}"); // Parse raw text into lines (one document per line) var doc_starts: Vec<Int> = Vec::new(); let doc_lens: Vec<Int> = Vec::new(); doc_starts.push(0); var pos = 0; while pos < raw_len { if string_char_at(raw, pos) == '\n' { let start = doc_starts[doc_starts.len() - 1]; let doc_len = pos - start; if doc_len > 0 { doc_lens.push(doc_len); doc_starts.push(pos + 1); } else { doc_starts[doc_starts.len() - 1] = pos + 1; } } pos += 1; } let last_start = doc_starts[doc_starts.len() - 1]; if last_start < raw_len { doc_lens.push(raw_len - last_start); } let num_docs = doc_lens.len(); println(f"num docs: {num_docs}"); // Build sorted vocabulary of unique characters var uchars: Vec<String> = Vec::new(); // unique chars as 1-char strings for di in 0..num_docs { let ds = doc_starts[di]; let dl = doc_lens[di]; for cj in 0..dl { let ch = raw.slice(ds + cj, ds + cj + 1); // Check if ch is already in uchars var found = false; var uk = 0; while uk < uchars.len() { if uchars[uk] == ch { found = true; break; } uk += 1; } if !found { uchars.push(ch); } } } // Sort uchars by char code (insertion sort) var si2 = 1; while si2 < uchars.len() { let key = uchars[si2].clone(); let key_code = string_char_at(key, 0); var j2 = si2 - 1; while j2 >= 0 { let a_code = string_char_at(uchars[j2], 0); if a_code > key_code { uchars[j2 + 1] = uchars[j2].clone(); j2 -= 1; } else { break; } } uchars[j2 + 1] = key; si2 += 1; } // Build char-to-token lookup let char_to_tok: HashMap<String, Int> = HashMap::new(); for i in 0..uchars.len() { char_to_tok.insert(uchars[i], i); } let bos = uchars.len(); // BOS token id let vocab_size = uchars.len() + 1; println(f"vocab size: {vocab_size}"); // --- Shuffle document indices --- let doc_order: Vec<Int> = Vec::new(); for di in 0..num_docs { doc_order.push(di); } random.seed(42); random.shuffle(doc_order); // --- Initialize model parameters --- println("Initializing parameters..."); let t = tape_new(); let model = Model { wte: mat_rand(t, vocab_size, N_EMBD, 0.08), wpe: mat_rand(t, BLOCK_SIZE, N_EMBD, 0.08), lm_head: mat_rand(t, vocab_size, N_EMBD, 0.08), attn_wq: mat_rand(t, N_EMBD, N_EMBD, 0.08), attn_wk: mat_rand(t, N_EMBD, N_EMBD, 0.08), attn_wv: mat_rand(t, N_EMBD, N_EMBD, 0.08), attn_wo: mat_rand(t, N_EMBD, N_EMBD, 0.08), mlp_fc1: mat_rand(t, 4 * N_EMBD, N_EMBD, 0.08), mlp_fc2: mat_rand(t, N_EMBD, 4 * N_EMBD, 0.08), }; // Record parameter count for Adam optimizer let num_params = t.data.len(); println(f"num params: {num_params}"); // Adam optimizer buffers (plain f64 arrays, not on tape) var adam_m: Vec<f64> = Vec::new(); var adam_v: Vec<f64> = Vec::new(); for pi in 0..num_params { adam_m.push(0.0); adam_v.push(0.0); } let learning_rate = 0.01; let beta1 = 0.85; let beta2 = 0.99; let eps_adam = 1.0e-8; // --- Training loop --- // 200 steps for decent training. Use 1000 for full training (matches Python). println(f"Training for {NUM_STEPS} steps..."); for step in 0..NUM_STEPS { // Pick document let doc_idx = doc_order[step % num_docs]; let ds = doc_starts[doc_idx]; let dl = doc_lens[doc_idx]; // Tokenize: [BOS] + chars + [BOS] let tokens = tokenize_doc(raw, ds, dl, char_to_tok, bos); var n = tokens.len() - 1; if n > BLOCK_SIZE { n = BLOCK_SIZE; } // Fresh tape per step: copy parameter values, discard prior computation graph let step_tape = tape_new(); // Copy parameter values into new tape for cp in 0..num_params { step_tape.val(t.vd(cp)); } // KV cache: keys_flat[pos * N_EMBD + d] = tape index for key vector let keys_flat = init_kv_cache(BLOCK_SIZE * N_EMBD); let vals_flat = init_kv_cache(BLOCK_SIZE * N_EMBD); // Forward pass: process each position let loss_terms: Vec<Val> = Vec::new(); for pos_id in 0..n { let token_id = tokens[pos_id]; let target_id = tokens[pos_id + 1]; let logits = gpt_forward(step_tape, model, token_id, pos_id, keys_flat, vals_flat, vocab_size); // Loss: -log(softmax(logits)[target]) let probs = softmax(step_tape, logits); let loss_t = step_tape.vneg(step_tape.vlog(probs[target_id])); loss_terms.push(loss_t); } // Average loss let loss_sum = step_tape.vsum(loss_terms); let fn_n = n.to_f64(); let loss = step_tape.vmul_const(loss_sum, 1.0 / fn_n); // Backward backward(step_tape, loss); // Adam update let step_f = (step + 1).to_f64(); let lr_t = learning_rate * (1.0 - step_f / NUM_STEPS.to_f64()); for pi in 0..num_params { let g = step_tape.vg(pi); adam_m[pi] = beta1 * adam_m[pi] + (1.0 - beta1) * g; adam_v[pi] = beta2 * adam_v[pi] + (1.0 - beta2) * g * g; let m_hat = adam_m[pi] / (1.0 - math.pow(beta1, step_f)); let v_hat = adam_v[pi] / (1.0 - math.pow(beta2, step_f)); let new_data = t.vd(pi) - lr_t * m_hat / (math.sqrt(v_hat) + eps_adam); vsd(t, pi, new_data); } let loss_val = step_tape.vd(loss); println(f"step {step + 1} / {NUM_STEPS} | loss {loss_val}"); } // --- Inference --- let temperature = 0.5; println("--- inference (new, hallucinated names) ---"); for sample_idx in 0..20 { // Fresh tape for inference (copy params) let inf_tape = tape_new(); for ip in 0..num_params { inf_tape.val(t.vd(ip)); } // Fresh KV cache let inf_keys = init_kv_cache(BLOCK_SIZE * N_EMBD); let inf_vals = init_kv_cache(BLOCK_SIZE * N_EMBD); var token_id = bos; var sample = ""; var gen_pos = 0; while gen_pos < BLOCK_SIZE { let logits = gpt_forward(inf_tape, model, token_id, gen_pos, inf_keys, inf_vals, vocab_size); // Apply temperature let scaled: Vec<Val> = Vec::new(); for d in 0..vocab_size { scaled.push(inf_tape.vmul_const(logits[d], 1.0 / temperature)); } let probs = softmax(inf_tape, scaled); // Sample next token let cum_weights: Vec<f64> = Vec::new(); var cum_total = 0.0; for d in 0..vocab_size { cum_total += inf_tape.vd(probs[d]); cum_weights.push(cum_total); } token_id = random.choices(cum_weights, cum_total, vocab_size); if token_id == bos { // End of sequence break; } else { // Append character from vocab let ch_str = uchars[token_id]; sample = sample + ch_str; gen_pos += 1; } } println(f"sample {sample_idx + 1}: {sample}"); } 0}