{"id":"3ZBHcfa32y","url":"https://pastebin.ca/3ZBHcfa32y","raw_url":"https://raw.anybin.ca/3ZBHcfa32y","visibility":"public","access":"public","created_at":1780157725039,"expires_at":null,"fetch_limit":null,"fetches_used":0,"reads_remaining":null,"size_bytes":20010,"syntax_hint":"hew","title":"MicroGPT in Hew v0.3","filename":"microgpt-v03.hew","change_note":null,"cipher":null,"cipher_meta":null,"parent_id":null,"root_id":"3ZBHcfa32y","version":1,"owner_id":"06F6D6Y710X9HQV90HF93KWQT8","recipient_id":null,"body":"// microgpt.hew — MicroGPT in pure Hew (v0.3 dialect)\n// Port of Karpathy's microgpt.py — a complete GPT language model.\n// Trains on a list of names, then generates new ones.\n//\n// Architecture:\n// - Tape-based autograd with reverse-mode differentiation\n// - GPT-2 style: token/position embeddings, multi-head attention, RMSNorm, MLP\n// - Adam optimizer with bias correction and linear LR decay\n// - Uses math.* (LLVM intrinsics) and random.* (MT19937) from Hew stdlib\n\n// ============================================================\n// Hyperparameters\n// ============================================================\n\nconst N_EMBD: Int = 16;\nconst BLOCK_SIZE: Int = 16;\nconst N_HEAD: Int = 4;\nconst HEAD_DIM: Int = 4;    // N_EMBD / N_HEAD\nconst NUM_STEPS: Int = 200;\n\n// ============================================================\n// SECTION 1: Tape-based Autograd\n// ============================================================\n// Every \"Value\" is an Int index into parallel arrays.\n// Operations append to the tape and record the DAG for backward().\n\n// A Val is a tape index — an Int that refers to a node in the computation graph.\ntype Val = Int;\n\n// Tape storage — passed as a struct to avoid globals\ntype Tape {\n    data: Vec<f64>;      // forward values\n    grad: Vec<f64>;      // gradient accumulators\n    left: Vec<Int>;      // left child index (-1 = none)\n    right: Vec<Int>;     // right child index (-1 = none)\n    lgrad: Vec<f64>;     // local gradient for left child\n    rgrad: Vec<f64>;     // local gradient for right child\n}\n\nfn tape_new() -> Tape {\n    let d: Vec<f64> = Vec::new();\n    let g: Vec<f64> = Vec::new();\n    let l: Vec<Int> = Vec::new();\n    let r: Vec<Int> = Vec::new();\n    let lg: Vec<f64> = Vec::new();\n    let rg: Vec<f64> = Vec::new();\n    Tape {\n        data: d,\n        grad: g,\n        left: l,\n        right: r,\n        lgrad: lg,\n        rgrad: rg\n    }\n}\n\nimpl Tape {\n\n// Create a new leaf value on the tape. Returns its index.\nfn val(t: Tape, d: f64) -> Val {\n    let idx = t.data.len();\n    t.data.push(d);\n    t.grad.push(0.0);\n    t.left.push(-1);\n    t.right.push(-1);\n    t.lgrad.push(0.0);\n    t.rgrad.push(0.0);\n    idx\n}\n\n// Get data of a tape value\nfn vd(t: Tape, i: Val) -> f64 { t.data[i] }\n\n// Get grad of a tape value\nfn vg(t: Tape, i: Val) -> f64 { t.grad[i] }\n\n// a + b\nfn vadd(t: Tape, a: Val, b: Val) -> Val {\n    let idx = t.data.len();\n    t.data.push(t.vd(a) + t.vd(b));\n    t.grad.push(0.0);\n    t.left.push(a);\n    t.right.push(b);\n    t.lgrad.push(1.0);\n    t.rgrad.push(1.0);\n    idx\n}\n\n// a * b\nfn vmul(t: Tape, a: Val, b: Val) -> Val {\n    let idx = t.data.len();\n    t.data.push(t.vd(a) * t.vd(b));\n    t.grad.push(0.0);\n    t.left.push(a);\n    t.right.push(b);\n    t.lgrad.push(t.vd(b));  // d/da(a*b) = b\n    t.rgrad.push(t.vd(a));  // d/db(a*b) = a\n    idx\n}\n\n// a ** c (c is a float constant, not a tape value)\nfn vpow(t: Tape, a: Val, c: f64) -> Val {\n    let ad = t.vd(a);\n    let idx = t.data.len();\n    t.data.push(math.pow(ad, c));\n    t.grad.push(0.0);\n    t.left.push(a);\n    t.right.push(-1);\n    t.lgrad.push(c * math.pow(ad, c - 1.0));  // d/da(a^c) = c*a^(c-1)\n    t.rgrad.push(0.0);\n    idx\n}\n\n// log(a)\nfn vlog(t: Tape, a: Val) -> Val {\n    let ad = t.vd(a);\n    let idx = t.data.len();\n    t.data.push(math.log(ad));\n    t.grad.push(0.0);\n    t.left.push(a);\n    t.right.push(-1);\n    t.lgrad.push(1.0 / ad);  // d/da(ln(a)) = 1/a\n    t.rgrad.push(0.0);\n    idx\n}\n\n// exp(a)\nfn vexp(t: Tape, a: Val) -> Val {\n    let ad = t.vd(a);\n    let ev = math.exp(ad);\n    let idx = t.data.len();\n    t.data.push(ev);\n    t.grad.push(0.0);\n    t.left.push(a);\n    t.right.push(-1);\n    t.lgrad.push(ev);  // d/da(e^a) = e^a\n    t.rgrad.push(0.0);\n    idx\n}\n\n// relu(a)\nfn vrelu(t: Tape, a: Val) -> Val {\n    let ad = t.vd(a);\n    let idx = t.data.len();\n    t.data.push(math.max(0.0, ad));\n    t.grad.push(0.0);\n    t.left.push(a);\n    t.right.push(-1);\n    if ad > 0.0 { t.lgrad.push(1.0); } else { t.lgrad.push(0.0); }\n    t.rgrad.push(0.0);\n    idx\n}\n\n// -a (negate)\nfn vneg(t: Tape, a: Val) -> Val {\n    let neg1 = t.val(-1.0);\n    t.vmul(a, neg1)\n}\n\n// a - b\nfn vsub(t: Tape, a: Val, b: Val) -> Val {\n    t.vadd(a, t.vneg(b))\n}\n\n// a / b = a * b^(-1)\nfn vdiv(t: Tape, a: Val, b: Val) -> Val {\n    t.vmul(a, t.vpow(b, -1.0))\n}\n\n// Multiply tape value by a float constant: a * c\nfn vmul_const(t: Tape, a: Val, c: f64) -> Val {\n    let cv = t.val(c);\n    t.vmul(a, cv)\n}\n\n// Add float constant: a + c\nfn vadd_const(t: Tape, a: Val, c: f64) -> Val {\n    let cv = t.val(c);\n    t.vadd(a, cv)\n}\n\n// Sum a list of tape values\nfn vsum(t: Tape, indices: Vec<Val>) -> Val {\n    let n = indices.len();\n    if n == 0 { return t.val(0.0); }\n    var acc = indices[0];\n    var i = 1;\n    while i < n { acc = t.vadd(acc, indices[i]); i += 1; }\n    acc\n}\n\n} // impl Tape\n\n// Set data of a tape value (standalone: impl method receivers cannot be var)\nfn vsd(var t: Tape, i: Val, d: f64) { t.data[i] = d; }\n\n// Backward pass: compute gradients for all tape values\nfn backward(var t: Tape, loss_idx: Val) {\n    // Topological sort is implicit: tape indices are already in topo order\n    // (every value depends only on values with smaller indices)\n    t.grad[loss_idx] = 1.0;\n\n    // Walk backward through tape\n    var i = loss_idx;\n    while i >= 0 {\n        let g = t.grad[i];\n        let li = t.left[i];\n        let ri = t.right[i];\n        if li >= 0 {\n            t.grad[li] += t.lgrad[i] * g;\n        }\n        if ri >= 0 {\n            t.grad[ri] += t.rgrad[i] * g;\n        }\n        i -= 1;\n    }\n}\n\n// ============================================================\n// SECTION 2: Matrix operations on tape values\n// ============================================================\n// A \"matrix\" is a flat Vec<Val> of tape indices, with rows*cols layout.\n// We store dimensions separately.\n\n// Create a matrix of random values on the tape\nfn mat_rand(t: Tape, rows: Int, cols: Int, std: f64) -> Vec<Val> {\n    let m: Vec<Val> = Vec::new();\n    for r in 0..rows {\n        for c in 0..cols {\n            let v = random.gauss(0.0, std);\n            m.push(t.val(v));\n        }\n    }\n    m\n}\n\n// Linear transform: y = W @ x, where W is [nout x nin], x is Vec<Val> of length nin\n// Returns Vec<Val> of length nout\nfn linear(t: Tape, w: Vec<Val>, nout: Int, x: Vec<Val>) -> Vec<Val> {\n    let nin = x.len();\n    let result: Vec<Val> = Vec::new();\n    for r in 0..nout {\n        // Dot product of row r of W with x\n        let temps: Vec<Val> = Vec::new();\n        for c in 0..nin {\n            let wi = w[r * nin + c];\n            let xi = x[c];\n            temps.push(t.vmul(wi, xi));\n        }\n        result.push(t.vsum(temps));\n    }\n    result\n}\n\n// Softmax on Vec<Val> of tape values. Returns Vec<Val> (new tape values = probs).\nfn softmax(t: Tape, logits: Vec<Val>) -> Vec<Val> {\n    let n = logits.len();\n    // Find max for numerical stability\n    var max_val = t.vd(logits[0]);\n    var i = 1;\n    while i < n {\n        let v = t.vd(logits[i]);\n        if v > max_val { max_val = v; }\n        i += 1;\n    }\n\n    // exp(logit - max)\n    let exps: Vec<Val> = Vec::new();\n    for i in 0..n {\n        let shifted = t.vsub(logits[i], t.val(max_val));\n        exps.push(t.vexp(shifted));\n    }\n\n    // sum of exps\n    let total = t.vsum(exps);\n\n    // divide each by total\n    let probs: Vec<Val> = Vec::new();\n    for i in 0..n { probs.push(t.vdiv(exps[i], total)); }\n    probs\n}\n\n// RMSNorm on Vec<Val>. Returns Vec<Val> of same length.\nfn rmsnorm(t: Tape, x: Vec<Val>) -> Vec<Val> {\n    let n = x.len();\n    // ms = sum(xi^2) / n\n    let sq_terms: Vec<Val> = Vec::new();\n    for i in 0..n { sq_terms.push(t.vmul(x[i], x[i])); }\n    let ms_sum = t.vsum(sq_terms);\n    let fn_val = n.to_f64();\n    let ms = t.vmul_const(ms_sum, 1.0 / fn_val);\n\n    // scale = (ms + 1e-5)^(-0.5)\n    let scale = t.vpow(t.vadd_const(ms, 1.0e-5), -0.5);\n\n    // xi * scale\n    let result: Vec<Val> = Vec::new();\n    for i in 0..n { result.push(t.vmul(x[i], scale)); }\n    result\n}\n\n// ============================================================\n// SECTION 3: Helpers, Model struct, GPT forward\n// ============================================================\n\n// Copy a Vec<Val> of tape indices (shallow copy of index values)\nfn vec_copy(src: Vec<Val>) -> Vec<Val> {\n    let dst: Vec<Val> = Vec::new();\n    for i in 0..src.len() { dst.push(src[i]); }\n    dst\n}\n\n// Initialize a KV cache of given size filled with zeros\nfn init_kv_cache(size: Int) -> Vec<Val> {\n    let cache: Vec<Val> = Vec::new();\n    for i in 0..size { cache.push(0); }\n    cache\n}\n\n// Tokenize a document: [BOS] + char tokens + [BOS]\nfn tokenize_doc(raw: String, start: Int, len: Int, char_to_tok: HashMap<String, Int>, bos: Int) -> Vec<Int> {\n    let tokens: Vec<Int> = Vec::new();\n    tokens.push(bos);\n    for i in 0..len {\n        let ch = raw.slice(start + i, start + i + 1);\n        match char_to_tok.get(ch) {\n            Some(tok) => tokens.push(tok),\n            None => {},\n        }\n    }\n    tokens.push(bos);\n    tokens\n}\n\n// All weight matrices for the 1-layer GPT\ntype Model {\n    wte: Vec<Val>;\n    wpe: Vec<Val>;\n    lm_head: Vec<Val>;\n    attn_wq: Vec<Val>;\n    attn_wk: Vec<Val>;\n    attn_wv: Vec<Val>;\n    attn_wo: Vec<Val>;\n    mlp_fc1: Vec<Val>;\n    mlp_fc2: Vec<Val>;\n}\n\n// Run one GPT forward step. Returns logits as Vec<Val>.\nfn gpt_forward(\n    tape: Tape, model: Model,\n    token_id: Int, pos_id: Int,\n    var keys: Vec<Val>, var vals: Vec<Val>,\n    vocab_size: Int\n) -> Vec<Val> {\n    // Token + position embedding\n    let x: Vec<Val> = Vec::new();\n    for d in 0..N_EMBD {\n        let tok_emb = model.wte[token_id * N_EMBD + d];\n        let pos_emb = model.wpe[pos_id * N_EMBD + d];\n        x.push(tape.vadd(tok_emb, pos_emb));\n    }\n\n    // RMSNorm\n    let x_normed = rmsnorm(tape, x);\n\n    // --- Attention block ---\n    let x_residual = vec_copy(x_normed);\n    let x_pre = rmsnorm(tape, x_normed);\n\n    let q = linear(tape, model.attn_wq, N_EMBD, x_pre);\n    let k = linear(tape, model.attn_wk, N_EMBD, x_pre);\n    let v = linear(tape, model.attn_wv, N_EMBD, x_pre);\n\n    // Store K, V in cache\n    for d in 0..N_EMBD {\n        keys[pos_id * N_EMBD + d] = k[d];\n        vals[pos_id * N_EMBD + d] = v[d];\n    }\n    let nc = pos_id + 1;\n\n    // Multi-head attention\n    let x_attn: Vec<Val> = Vec::new();\n    for h in 0..N_HEAD {\n        let hs = h * HEAD_DIM;\n        let attn_logits: Vec<Val> = Vec::new();\n        for tt in 0..nc {\n            let dot_terms: Vec<Val> = Vec::new();\n            for jj in 0..HEAD_DIM {\n                dot_terms.push(tape.vmul(q[hs + jj], keys[tt * N_EMBD + hs + jj]));\n            }\n            let dot = tape.vsum(dot_terms);\n            attn_logits.push(tape.vmul_const(dot, 1.0 / math.sqrt(HEAD_DIM.to_f64())));\n        }\n        let attn_weights = softmax(tape, attn_logits);\n        for jj in 0..HEAD_DIM {\n            let weighted_terms: Vec<Val> = Vec::new();\n            for tt in 0..nc {\n                weighted_terms.push(tape.vmul(attn_weights[tt], vals[tt * N_EMBD + hs + jj]));\n            }\n            x_attn.push(tape.vsum(weighted_terms));\n        }\n    }\n\n    // Output projection + residual\n    let x_proj = linear(tape, model.attn_wo, N_EMBD, x_attn);\n    let x_after_attn: Vec<Val> = Vec::new();\n    for d in 0..N_EMBD {\n        x_after_attn.push(tape.vadd(x_proj[d], x_residual[d]));\n    }\n\n    // --- MLP block ---\n    let x_res2 = vec_copy(x_after_attn);\n    let x_mlp_in = rmsnorm(tape, x_after_attn);\n    let x_fc1 = linear(tape, model.mlp_fc1, 4 * N_EMBD, x_mlp_in);\n    let x_relu: Vec<Val> = Vec::new();\n    for d in 0..(4 * N_EMBD) { x_relu.push(tape.vrelu(x_fc1[d])); }\n    let x_fc2 = linear(tape, model.mlp_fc2, N_EMBD, x_relu);\n    let x_final: Vec<Val> = Vec::new();\n    for d in 0..N_EMBD {\n        x_final.push(tape.vadd(x_fc2[d], x_res2[d]));\n    }\n\n    // Logits\n    linear(tape, model.lm_head, vocab_size, x_final)\n}\n\n// ============================================================\n// SECTION 4: Main — Dataset, Training, Inference\n// ============================================================\n\nfn main() -> Int {\n    // --- Load dataset ---\n    println(\"Loading dataset...\");\n    let raw = read_file(\"input.txt\");\n    let raw_len = raw.len();\n    println(f\"File length: {raw_len}\");\n\n    // Parse raw text into lines (one document per line)\n    var doc_starts: Vec<Int> = Vec::new();\n    let doc_lens: Vec<Int> = Vec::new();\n    doc_starts.push(0);\n    var pos = 0;\n    while pos < raw_len {\n        if string_char_at(raw, pos) == '\\n' {\n            let start = doc_starts[doc_starts.len() - 1];\n            let doc_len = pos - start;\n            if doc_len > 0 {\n                doc_lens.push(doc_len);\n                doc_starts.push(pos + 1);\n            } else {\n                doc_starts[doc_starts.len() - 1] = pos + 1;\n            }\n        }\n        pos += 1;\n    }\n    let last_start = doc_starts[doc_starts.len() - 1];\n    if last_start < raw_len {\n        doc_lens.push(raw_len - last_start);\n    }\n    let num_docs = doc_lens.len();\n    println(f\"num docs: {num_docs}\");\n\n    // Build sorted vocabulary of unique characters\n    var uchars: Vec<String> = Vec::new();   // unique chars as 1-char strings\n\n    for di in 0..num_docs {\n        let ds = doc_starts[di];\n        let dl = doc_lens[di];\n        for cj in 0..dl {\n            let ch = raw.slice(ds + cj, ds + cj + 1);\n            // Check if ch is already in uchars\n            var found = false;\n            var uk = 0;\n            while uk < uchars.len() {\n                if uchars[uk] == ch {\n                    found = true;\n                    break;\n                }\n                uk += 1;\n            }\n            if !found {\n                uchars.push(ch);\n            }\n        }\n    }\n\n    // Sort uchars by char code (insertion sort)\n    var si2 = 1;\n    while si2 < uchars.len() {\n        let key = uchars[si2].clone();\n        let key_code = string_char_at(key, 0);\n        var j2 = si2 - 1;\n        while j2 >= 0 {\n            let a_code = string_char_at(uchars[j2], 0);\n            if a_code > key_code {\n                uchars[j2 + 1] = uchars[j2].clone();\n                j2 -= 1;\n            } else {\n                break;\n            }\n        }\n        uchars[j2 + 1] = key;\n        si2 += 1;\n    }\n\n    // Build char-to-token lookup\n    let char_to_tok: HashMap<String, Int> = HashMap::new();\n    for i in 0..uchars.len() {\n        char_to_tok.insert(uchars[i], i);\n    }\n\n    let bos = uchars.len();  // BOS token id\n    let vocab_size = uchars.len() + 1;\n    println(f\"vocab size: {vocab_size}\");\n\n    // --- Shuffle document indices ---\n    let doc_order: Vec<Int> = Vec::new();\n    for di in 0..num_docs { doc_order.push(di); }\n\n    random.seed(42);\n    random.shuffle(doc_order);\n\n    // --- Initialize model parameters ---\n    println(\"Initializing parameters...\");\n    let t = tape_new();\n\n    let model = Model {\n        wte: mat_rand(t, vocab_size, N_EMBD, 0.08),\n        wpe: mat_rand(t, BLOCK_SIZE, N_EMBD, 0.08),\n        lm_head: mat_rand(t, vocab_size, N_EMBD, 0.08),\n        attn_wq: mat_rand(t, N_EMBD, N_EMBD, 0.08),\n        attn_wk: mat_rand(t, N_EMBD, N_EMBD, 0.08),\n        attn_wv: mat_rand(t, N_EMBD, N_EMBD, 0.08),\n        attn_wo: mat_rand(t, N_EMBD, N_EMBD, 0.08),\n        mlp_fc1: mat_rand(t, 4 * N_EMBD, N_EMBD, 0.08),\n        mlp_fc2: mat_rand(t, N_EMBD, 4 * N_EMBD, 0.08),\n    };\n\n    // Record parameter count for Adam optimizer\n    let num_params = t.data.len();\n    println(f\"num params: {num_params}\");\n\n    // Adam optimizer buffers (plain f64 arrays, not on tape)\n    var adam_m: Vec<f64> = Vec::new();\n    var adam_v: Vec<f64> = Vec::new();\n    for pi in 0..num_params { adam_m.push(0.0); adam_v.push(0.0); }\n\n    let learning_rate = 0.01;\n    let beta1 = 0.85;\n    let beta2 = 0.99;\n    let eps_adam = 1.0e-8;\n\n    // --- Training loop ---\n    // 200 steps for decent training. Use 1000 for full training (matches Python).\n    println(f\"Training for {NUM_STEPS} steps...\");\n\n    for step in 0..NUM_STEPS {\n        // Pick document\n        let doc_idx = doc_order[step % num_docs];\n        let ds = doc_starts[doc_idx];\n        let dl = doc_lens[doc_idx];\n\n        // Tokenize: [BOS] + chars + [BOS]\n        let tokens = tokenize_doc(raw, ds, dl, char_to_tok, bos);\n\n        var n = tokens.len() - 1;\n        if n > BLOCK_SIZE { n = BLOCK_SIZE; }\n\n        // Fresh tape per step: copy parameter values, discard prior computation graph\n        let step_tape = tape_new();\n        // Copy parameter values into new tape\n        for cp in 0..num_params {\n            step_tape.val(t.vd(cp));\n        }\n\n        // KV cache: keys_flat[pos * N_EMBD + d] = tape index for key vector\n        let keys_flat = init_kv_cache(BLOCK_SIZE * N_EMBD);\n        let vals_flat = init_kv_cache(BLOCK_SIZE * N_EMBD);\n\n        // Forward pass: process each position\n        let loss_terms: Vec<Val> = Vec::new();\n        for pos_id in 0..n {\n            let token_id = tokens[pos_id];\n            let target_id = tokens[pos_id + 1];\n\n            let logits = gpt_forward(step_tape, model, token_id, pos_id, keys_flat, vals_flat, vocab_size);\n\n            // Loss: -log(softmax(logits)[target])\n            let probs = softmax(step_tape, logits);\n            let loss_t = step_tape.vneg(step_tape.vlog(probs[target_id]));\n            loss_terms.push(loss_t);\n        }\n\n        // Average loss\n        let loss_sum = step_tape.vsum(loss_terms);\n        let fn_n = n.to_f64();\n        let loss = step_tape.vmul_const(loss_sum, 1.0 / fn_n);\n\n        // Backward\n        backward(step_tape, loss);\n\n        // Adam update\n        let step_f = (step + 1).to_f64();\n        let lr_t = learning_rate * (1.0 - step_f / NUM_STEPS.to_f64());\n        for pi in 0..num_params {\n            let g = step_tape.vg(pi);\n            adam_m[pi] = beta1 * adam_m[pi] + (1.0 - beta1) * g;\n            adam_v[pi] = beta2 * adam_v[pi] + (1.0 - beta2) * g * g;\n            let m_hat = adam_m[pi] / (1.0 - math.pow(beta1, step_f));\n            let v_hat = adam_v[pi] / (1.0 - math.pow(beta2, step_f));\n            let new_data = t.vd(pi) - lr_t * m_hat / (math.sqrt(v_hat) + eps_adam);\n            vsd(t, pi, new_data);\n        }\n\n        let loss_val = step_tape.vd(loss);\n        println(f\"step {step + 1} / {NUM_STEPS} | loss {loss_val}\");\n    }\n\n    // --- Inference ---\n    let temperature = 0.5;\n    println(\"--- inference (new, hallucinated names) ---\");\n\n    for sample_idx in 0..20 {\n        // Fresh tape for inference (copy params)\n        let inf_tape = tape_new();\n        for ip in 0..num_params { inf_tape.val(t.vd(ip)); }\n\n        // Fresh KV cache\n        let inf_keys = init_kv_cache(BLOCK_SIZE * N_EMBD);\n        let inf_vals = init_kv_cache(BLOCK_SIZE * N_EMBD);\n\n        var token_id = bos;\n        var sample = \"\";\n        var gen_pos = 0;\n        while gen_pos < BLOCK_SIZE {\n            let logits = gpt_forward(inf_tape, model, token_id, gen_pos, inf_keys, inf_vals, vocab_size);\n\n            // Apply temperature\n            let scaled: Vec<Val> = Vec::new();\n            for d in 0..vocab_size {\n                scaled.push(inf_tape.vmul_const(logits[d], 1.0 / temperature));\n            }\n            let probs = softmax(inf_tape, scaled);\n\n            // Sample next token\n            let cum_weights: Vec<f64> = Vec::new();\n            var cum_total = 0.0;\n            for d in 0..vocab_size {\n                cum_total += inf_tape.vd(probs[d]);\n                cum_weights.push(cum_total);\n            }\n\n            token_id = random.choices(cum_weights, cum_total, vocab_size);\n\n            if token_id == bos {\n                // End of sequence\n                break;\n            } else {\n                // Append character from vocab\n                let ch_str = uchars[token_id];\n                sample = sample + ch_str;\n                gen_pos += 1;\n            }\n        }\n\n        println(f\"sample {sample_idx + 1}: {sample}\");\n    }\n\n    0\n}"}