saltar al contenido principal
paste
bin
.ca
type · paste · share
⌘
K
Docs
Iniciar sesión
?
← volver a la publicación
›
Editar / bifurcar
MicroGPT in Hew v0.3
#3ZBHcfa32y
public / public
nueva versión
por @slepp
creado 11 days ago
sin caducidad
19.5 KB
sintaxis:
hew
Tus cambios crean una nueva publicación enlazada a esta — la original no se toca.
nueva versión
Tus cambios crean una nueva publicación enlazada a esta — la original no se toca.
Título (opcional)
Nombre de archivo
Sintaxis
hew
text
bash
c
cpp
css
diff
dockerfile
go
html
ini
java
javascript
json
kotlin
lua
makefile
markdown
nginx
php
python
ruby
rust
shellscript
sql
swift
toml
typescript
xml
yaml
Visibilidad
Feed público
Acceso
public
Caduca
7 días
10 min
1 hora
1 día
7 días
30 días
90 días
personalizada…
Caducidad personalizada
Nota de cambio
(opcional)
Esta publicación aparecerá en el feed público. Cambia Visibilidad si solo quieres compartirla por enlace.
Crear nueva versión
Cancelar
Pegar o escribir…
// microgpt.hew — MicroGPT in pure Hew (v0.3 dialect)
// Port of Karpathy's microgpt.py — a complete GPT language model.
// Trains on a list of names, then generates new ones.
//
// Architecture:
//   - Tape-based autograd with reverse-mode differentiation
//   - GPT-2 style: token/position embeddings, multi-head attention, RMSNorm, MLP
//   - Adam optimizer with bias correction and linear LR decay
//   - Uses math.* (LLVM intrinsics) and random.* (MT19937) from Hew stdlib

// ============================================================
// Hyperparameters
// ============================================================

const N_EMBD: Int = 16;      // embedding width
const BLOCK_SIZE: Int = 16;  // maximum number of positions per sequence
const N_HEAD: Int = 4;       // number of attention heads
const HEAD_DIM: Int = 4;     // N_EMBD / N_HEAD
const NUM_STEPS: Int = 200;  // training steps

// ============================================================
// SECTION 1: Tape-based Autograd
// ============================================================
// A value in the computation graph is identified by its position on the tape.
// Every operation appends one node and remembers up to two operands together
// with the local derivative toward each of them, which is all backward() needs.

// Index of a node on the tape.
type Val = Int;

// Tape storage as parallel arrays (one entry per node); passed around as a
// struct so no globals are needed.
type Tape {
    data: Vec<f64>;    // forward value of each node
    grad: Vec<f64>;    // accumulated gradient of each node
    left: Vec<Int>;    // index of the first operand, -1 when absent
    right: Vec<Int>;   // index of the second operand, -1 when absent
    lgrad: Vec<f64>;   // local derivative with respect to the first operand
    rgrad: Vec<f64>;   // local derivative with respect to the second operand
}

// Build a tape with no nodes on it.
fn tape_new() -> Tape {
    let values: Vec<f64> = Vec::new();
    let grads: Vec<f64> = Vec::new();
    let lhs: Vec<Int> = Vec::new();
    let rhs: Vec<Int> = Vec::new();
    let lhs_local: Vec<f64> = Vec::new();
    let rhs_local: Vec<f64> = Vec::new();
    Tape {
        data: values,
        grad: grads,
        left: lhs,
        right: rhs,
        lgrad: lhs_local,
        rgrad: rhs_local
    }
}

impl Tape {
    // Internal helper: append one node (value, operands, local derivatives)
    // with a zero gradient and return the index it was stored at.
    fn push_node(t: Tape, value: f64, lhs: Int, rhs: Int, dl: f64, dr: f64) -> Val {
        let at = t.data.len();
        t.data.push(value);
        t.grad.push(0.0);
        t.left.push(lhs);
        t.right.push(rhs);
        t.lgrad.push(dl);
        t.rgrad.push(dr);
        at
    }

    // Append a leaf (no operands) holding `d`; returns its index.
    fn val(t: Tape, d: f64) -> Val {
        t.push_node(d, -1, -1, 0.0, 0.0)
    }

    // Forward value stored at node `i`.
    fn vd(t: Tape, i: Val) -> f64 {
        t.data[i]
    }

    // Gradient currently accumulated at node `i`.
    fn vg(t: Tape, i: Val) -> f64 {
        t.grad[i]
    }

    // a + b; both local derivatives are 1.
    fn vadd(t: Tape, a: Val, b: Val) -> Val {
        let total = t.vd(a) + t.vd(b);
        t.push_node(total, a, b, 1.0, 1.0)
    }

    // a * b; d/da = b and d/db = a.
    fn vmul(t: Tape, a: Val, b: Val) -> Val {
        let av = t.vd(a);
        let bv = t.vd(b);
        t.push_node(av * bv, a, b, bv, av)
    }

    // a ** c, where c is a plain float (not a tape value); d/da = c * a^(c-1).
    fn vpow(t: Tape, a: Val, c: f64) -> Val {
        let base = t.vd(a);
        let powered = math.pow(base, c);
        let slope = c * math.pow(base, c - 1.0);
        t.push_node(powered, a, -1, slope, 0.0)
    }

    // Natural log of a; d/da = 1/a.
    fn vlog(t: Tape, a: Val) -> Val {
        let arg = t.vd(a);
        t.push_node(math.log(arg), a, -1, 1.0 / arg, 0.0)
    }

    // e ** a; the derivative equals the forward value.
    fn vexp(t: Tape, a: Val) -> Val {
        let out = math.exp(t.vd(a));
        t.push_node(out, a, -1, out, 0.0)
    }

    // max(0, a); derivative is 1 for strictly positive input, 0 otherwise.
    fn vrelu(t: Tape, a: Val) -> Val {
        let arg = t.vd(a);
        var slope = 0.0;
        if arg > 0.0 {
            slope = 1.0;
        }
        t.push_node(math.max(0.0, arg), a, -1, slope, 0.0)
    }

    // -a, expressed as a * (-1) with the -1 stored as a leaf.
    fn vneg(t: Tape, a: Val) -> Val {
        t.vmul_const(a, -1.0)
    }

    // a - b, expressed as a + (-b).
    fn vsub(t: Tape, a: Val, b: Val) -> Val {
        let minus_b = t.vneg(b);
        t.vadd(a, minus_b)
    }

    // a / b, expressed as a * b^(-1).
    fn vdiv(t: Tape, a: Val, b: Val) -> Val {
        let inverse = t.vpow(b, -1.0);
        t.vmul(a, inverse)
    }

    // a * c for a float constant c (the constant becomes a leaf on the tape).
    fn vmul_const(t: Tape, a: Val, c: f64) -> Val {
        let leaf = t.val(c);
        t.vmul(a, leaf)
    }

    // a + c for a float constant c (the constant becomes a leaf on the tape).
    fn vadd_const(t: Tape, a: Val, c: f64) -> Val {
        let leaf = t.val(c);
        t.vadd(a, leaf)
    }

    // Left-to-right sum of the given nodes. An empty list yields a new 0.0 leaf;
    // a single-element list returns that element unchanged.
    fn vsum(t: Tape, indices: Vec<Val>) -> Val {
        let n = indices.len();
        if n == 0 {
            return t.val(0.0);
        }
        var total = indices[0];
        for k in 0..(n - 1) {
            total = t.vadd(total, indices[k + 1]);
        }
        total
    }
}  // impl Tape

// Overwrite the forward value of node `i`.
// Standalone because impl method receivers cannot be `var`.
fn vsd(var t: Tape, i: Val, d: f64) {
    t.data[i] = d;
}

// Reverse pass: seed d(loss)/d(loss) = 1 and propagate gradients toward the
// leaves. No explicit topological sort is needed because every node only
// refers to operands with smaller indices, so visiting indices in descending
// order from the loss is already a valid order.
fn backward(var t: Tape, loss_idx: Val) {
    t.grad[loss_idx] = 1.0;

    for step in 0..(loss_idx + 1) {
        let i = loss_idx - step;
        let upstream = t.grad[i];
        let lhs = t.left[i];
        let rhs = t.right[i];
        if lhs >= 0 {
            t.grad[lhs] = t.grad[lhs] + t.lgrad[i] * upstream;
        }
        if rhs >= 0 {
            t.grad[rhs] = t.grad[rhs] + t.rgrad[i] * upstream;
        }
    }
}

// ============================================================
// SECTION 2: Matrix operations on tape values
// ============================================================
// A "matrix" is a flat Vec<Val> of tape indices, with rows*cols layout.
// We store dimensions separately.
// Create a matrix of random values on the tape.
// Appends rows * cols leaves, each drawn with random.gauss(0.0, std), in
// row-major order, and returns their tape indices as a flat Vec<Val>.
fn mat_rand(t: Tape, rows: Int, cols: Int, std: f64) -> Vec<Val> {
    let m: Vec<Val> = Vec::new();
    for r in 0..rows {
        for c in 0..cols {
            let v = random.gauss(0.0, std);
            m.push(t.val(v));
        }
    }
    m
}

// Linear transform: y = W @ x, where W is [nout x nin] stored row-major in `w`
// and x is a Vec<Val> of length nin (nin is taken from x.len()).
// Returns a Vec<Val> of length nout.
fn linear(t: Tape, w: Vec<Val>, nout: Int, x: Vec<Val>) -> Vec<Val> {
    let nin = x.len();
    let result: Vec<Val> = Vec::new();
    for r in 0..nout {
        // Dot product of row r of W with x
        let temps: Vec<Val> = Vec::new();
        for c in 0..nin {
            let wi = w[r * nin + c];
            let xi = x[c];
            temps.push(t.vmul(wi, xi));
        }
        result.push(t.vsum(temps));
    }
    result
}

// Softmax on a Vec<Val> of tape values. Returns a Vec<Val> of new tape values
// (the probabilities), in the same order as `logits`.
// An empty input returns an empty result without touching the tape.
fn softmax(t: Tape, logits: Vec<Val>) -> Vec<Val> {
    let n = logits.len();
    let probs: Vec<Val> = Vec::new();
    if n == 0 {
        // Nothing to normalize; also keeps us from reading logits[0] below.
        return probs;
    }

    // Find the largest logit; subtracting it keeps exp() from overflowing.
    var max_val = t.vd(logits[0]);
    var k = 1;
    while k < n {
        let v = t.vd(logits[k]);
        if v > max_val {
            max_val = v;
        }
        k += 1;
    }

    // The max enters the graph as a constant leaf. One shared node is enough:
    // creating it once instead of once per element keeps the tape smaller
    // without changing any forward value or any parameter gradient.
    let max_node = t.val(max_val);

    // exp(logit - max)
    let exps: Vec<Val> = Vec::new();
    for i in 0..n {
        let shifted = t.vsub(logits[i], max_node);
        exps.push(t.vexp(shifted));
    }

    // sum of exps
    let total = t.vsum(exps);

    // divide each by total
    for i in 0..n {
        probs.push(t.vdiv(exps[i], total));
    }
    probs
}

// RMSNorm on Vec<Val>. Returns Vec<Val> of same length.
// Each output is x[i] * (mean(x^2) + 1e-5)^(-0.5); the scale is computed once
// as a single tape node and shared by every element.
fn rmsnorm(t: Tape, x: Vec<Val>) -> Vec<Val> {
    let n = x.len();

    // Squares of the inputs, summed on the tape.
    let squares: Vec<Val> = Vec::new();
    var k = 0;
    while k < n {
        squares.push(t.vmul(x[k], x[k]));
        k += 1;
    }
    let sum_sq = t.vsum(squares);

    // Mean square = sum / n.
    let inv_n = 1.0 / n.to_f64();
    let mean_sq = t.vmul_const(sum_sq, inv_n);

    // scale = (mean_sq + 1e-5)^(-0.5)
    let stabilized = t.vadd_const(mean_sq, 1.0e-5);
    let scale = t.vpow(stabilized, -0.5);

    let normed: Vec<Val> = Vec::new();
    var j = 0;
    while j < n {
        normed.push(t.vmul(x[j], scale));
        j += 1;
    }
    normed
}

// ============================================================
// SECTION 3: Helpers, Model struct, GPT forward
// ============================================================

// Return a new Vec<Val> holding the same tape indices as `src`, in order.
// Only the indices are copied; no tape nodes are created.
fn vec_copy(src: Vec<Val>) -> Vec<Val> {
    let out: Vec<Val> = Vec::new();
    var k = 0;
    while k < src.len() {
        out.push(src[k]);
        k += 1;
    }
    out
}

// Build a KV cache with `size` slots, every slot initialised to index 0.
fn init_kv_cache(size: Int) -> Vec<Val> {
    let slots: Vec<Val> = Vec::new();
    var filled = 0;
    while filled < size {
        slots.push(0);
        filled += 1;
    }
    slots
}

// Tokenize one document: [BOS] + one token per character + [BOS].
// The document is the `len` one-unit slices of `raw` beginning at `start`.
// Characters that have no entry in `char_to_tok` are skipped.
fn tokenize_doc(raw: String, start: Int, len: Int, char_to_tok: HashMap<String, Int>, bos: Int) -> Vec<Int> {
    let out: Vec<Int> = Vec::new();
    out.push(bos);

    let stop = start + len;
    var offset = start;
    while offset < stop {
        let piece = raw.slice(offset, offset + 1);
        match char_to_tok.get(piece) {
            Some(id) => out.push(id),
            None => {},
        }
        offset += 1;
    }

    out.push(bos);
    out
}

// Weight matrices of the 1-layer GPT. Each field is a flat, row-major list of
// tape indices; the shapes noted below are the ones main() allocates.
type Model {
    wte: Vec<Val>;      // token embeddings      [vocab_size x N_EMBD]
    wpe: Vec<Val>;      // position embeddings   [BLOCK_SIZE x N_EMBD]
    lm_head: Vec<Val>;  // output projection     [vocab_size x N_EMBD]
    attn_wq: Vec<Val>;  // attention query       [N_EMBD x N_EMBD]
    attn_wk: Vec<Val>;  // attention key         [N_EMBD x N_EMBD]
    attn_wv: Vec<Val>;  // attention value       [N_EMBD x N_EMBD]
    attn_wo: Vec<Val>;  // attention output      [N_EMBD x N_EMBD]
    mlp_fc1: Vec<Val>;  // MLP expand            [4*N_EMBD x N_EMBD]
    mlp_fc2: Vec<Val>;  // MLP contract          [N_EMBD x 4*N_EMBD]
}

// Run one GPT forward step. Returns logits as Vec<Val>.
// Forward pass for a single token at a single position.
//
//   tape       - tape that receives every node created by this step
//   model      - weight matrices (tape indices valid on `tape`)
//   token_id   - input token; selects a row of model.wte
//   pos_id     - position in the sequence; selects a row of model.wpe and the
//                cache slot that this step's key/value vectors are written to
//   keys, vals - flat caches laid out as [pos * N_EMBD + d]; slots for
//                positions 0..pos_id must already hold earlier steps' entries
//   vocab_size - number of rows of model.lm_head, i.e. number of logits
//
// Returns vocab_size logits as tape indices.
fn gpt_forward(
    tape: Tape,
    model: Model,
    token_id: Int,
    pos_id: Int,
    var keys: Vec<Val>,
    var vals: Vec<Val>,
    vocab_size: Int
) -> Vec<Val> {
    // Token + position embedding
    let x: Vec<Val> = Vec::new();
    for d in 0..N_EMBD {
        let tok_emb = model.wte[token_id * N_EMBD + d];
        let pos_emb = model.wpe[pos_id * N_EMBD + d];
        x.push(tape.vadd(tok_emb, pos_emb));
    }

    // RMSNorm
    let x_normed = rmsnorm(tape, x);

    // --- Attention block ---
    let x_residual = vec_copy(x_normed);
    let x_pre = rmsnorm(tape, x_normed);

    let q = linear(tape, model.attn_wq, N_EMBD, x_pre);
    let k = linear(tape, model.attn_wk, N_EMBD, x_pre);
    let v = linear(tape, model.attn_wv, N_EMBD, x_pre);

    // Store K, V in cache
    for d in 0..N_EMBD {
        keys[pos_id * N_EMBD + d] = k[d];
        vals[pos_id * N_EMBD + d] = v[d];
    }
    // Number of cached positions this step may attend to (0..pos_id inclusive).
    let nc = pos_id + 1;

    // 1/sqrt(HEAD_DIM) is the same for every head and position, so compute it once.
    let attn_scale = 1.0 / math.sqrt(HEAD_DIM.to_f64());

    // Multi-head attention
    let x_attn: Vec<Val> = Vec::new();
    for h in 0..N_HEAD {
        let hs = h * HEAD_DIM;  // offset of this head's slice inside an N_EMBD vector

        // Scaled dot product of this head's query with every cached key.
        let attn_logits: Vec<Val> = Vec::new();
        for tt in 0..nc {
            let dot_terms: Vec<Val> = Vec::new();
            for jj in 0..HEAD_DIM {
                dot_terms.push(tape.vmul(q[hs + jj], keys[tt * N_EMBD + hs + jj]));
            }
            let dot = tape.vsum(dot_terms);
            attn_logits.push(tape.vmul_const(dot, attn_scale));
        }
        let attn_weights = softmax(tape, attn_logits);

        // Attention-weighted sum of the cached values, one output per head dim.
        for jj in 0..HEAD_DIM {
            let weighted_terms: Vec<Val> = Vec::new();
            for tt in 0..nc {
                weighted_terms.push(tape.vmul(attn_weights[tt], vals[tt * N_EMBD + hs + jj]));
            }
            x_attn.push(tape.vsum(weighted_terms));
        }
    }

    // Output projection + residual
    let x_proj = linear(tape, model.attn_wo, N_EMBD, x_attn);
    let x_after_attn: Vec<Val> = Vec::new();
    for d in 0..N_EMBD {
        x_after_attn.push(tape.vadd(x_proj[d], x_residual[d]));
    }

    // --- MLP block ---
    let x_res2 = vec_copy(x_after_attn);
    let x_mlp_in = rmsnorm(tape, x_after_attn);
    let x_fc1 = linear(tape, model.mlp_fc1, 4 * N_EMBD, x_mlp_in);
    let x_relu: Vec<Val> = Vec::new();
    for d in 0..(4 * N_EMBD) {
        x_relu.push(tape.vrelu(x_fc1[d]));
    }
    let x_fc2 = linear(tape, model.mlp_fc2, N_EMBD, x_relu);
    let x_final: Vec<Val> = Vec::new();
    for d in 0..N_EMBD {
        x_final.push(tape.vadd(x_fc2[d], x_res2[d]));
    }

    // Logits
    linear(tape, model.lm_head, vocab_size, x_final)
}

// ============================================================
// SECTION 4: Main — Dataset, Training, Inference
// ============================================================

// Entry point: reads "input.txt" (one document per line), trains the model for
// NUM_STEPS steps while printing the loss, then prints 20 generated samples.
// Returns 0 on success, or 1 when the input contains no documents.
fn main() -> Int {
    // --- Load dataset ---
    println("Loading dataset...");
    let raw = read_file("input.txt");
    let raw_len = raw.len();
    println(f"File length: {raw_len}");

    // Parse raw text into lines (one document per line).
    // doc_starts always has one more entry than doc_lens while scanning: its
    // last element is the start of the line currently being read. Empty lines
    // just move that pending start forward instead of producing a document.
    var doc_starts: Vec<Int> = Vec::new();
    let doc_lens: Vec<Int> = Vec::new();
    doc_starts.push(0);
    var pos = 0;
    while pos < raw_len {
        if string_char_at(raw, pos) == '\n' {
            let start = doc_starts[doc_starts.len() - 1];
            let doc_len = pos - start;
            if doc_len > 0 {
                doc_lens.push(doc_len);
                doc_starts.push(pos + 1);
            } else {
                doc_starts[doc_starts.len() - 1] = pos + 1;
            }
        }
        pos += 1;
    }
    // A final line without a trailing newline is still a document.
    let last_start = doc_starts[doc_starts.len() - 1];
    if last_start < raw_len {
        doc_lens.push(raw_len - last_start);
    }
    let num_docs = doc_lens.len();
    println(f"num docs: {num_docs}");

    // Training picks documents with `step % num_docs`, which is undefined for
    // an empty dataset; stop here with a clear message instead.
    if num_docs == 0 {
        println("No documents found in input.txt; nothing to train on.");
        return 1;
    }

    // Build sorted vocabulary of unique characters
    var uchars: Vec<String> = Vec::new();  // unique chars as 1-char strings
    for di in 0..num_docs {
        let ds = doc_starts[di];
        let dl = doc_lens[di];
        for cj in 0..dl {
            let ch = raw.slice(ds + cj, ds + cj + 1);
            // Check if ch is already in uchars
            var found = false;
            var uk = 0;
            while uk < uchars.len() {
                if uchars[uk] == ch {
                    found = true;
                    break;
                }
                uk += 1;
            }
            if !found {
                uchars.push(ch);
            }
        }
    }

    // Sort uchars by char code (insertion sort)
    var si2 = 1;
    while si2 < uchars.len() {
        let key = uchars[si2].clone();
        let key_code = string_char_at(key, 0);
        var j2 = si2 - 1;
        while j2 >= 0 {
            let a_code = string_char_at(uchars[j2], 0);
            if a_code > key_code {
                uchars[j2 + 1] = uchars[j2].clone();
                j2 -= 1;
            } else {
                break;
            }
        }
        uchars[j2 + 1] = key;
        si2 += 1;
    }

    // Build char-to-token lookup: token id = position in the sorted vocabulary
    let char_to_tok: HashMap<String, Int> = HashMap::new();
    for i in 0..uchars.len() {
        char_to_tok.insert(uchars[i], i);
    }

    let bos = uchars.len();  // BOS token id (one past the last character id)
    let vocab_size = uchars.len() + 1;
    println(f"vocab size: {vocab_size}");

    // --- Shuffle document indices ---
    let doc_order: Vec<Int> = Vec::new();
    for di in 0..num_docs {
        doc_order.push(di);
    }
    random.seed(42);
    random.shuffle(doc_order);

    // --- Initialize model parameters ---
    // The parameters are the first nodes created on `t`, so they occupy tape
    // indices 0..num_params. Every per-step tape below re-creates them in the
    // same order, which is what lets `model`'s indices be reused on those tapes.
    println("Initializing parameters...");
    let t = tape_new();

    let model = Model {
        wte: mat_rand(t, vocab_size, N_EMBD, 0.08),
        wpe: mat_rand(t, BLOCK_SIZE, N_EMBD, 0.08),
        lm_head: mat_rand(t, vocab_size, N_EMBD, 0.08),
        attn_wq: mat_rand(t, N_EMBD, N_EMBD, 0.08),
        attn_wk: mat_rand(t, N_EMBD, N_EMBD, 0.08),
        attn_wv: mat_rand(t, N_EMBD, N_EMBD, 0.08),
        attn_wo: mat_rand(t, N_EMBD, N_EMBD, 0.08),
        mlp_fc1: mat_rand(t, 4 * N_EMBD, N_EMBD, 0.08),
        mlp_fc2: mat_rand(t, N_EMBD, 4 * N_EMBD, 0.08),
    };

    // Record parameter count for Adam optimizer
    let num_params = t.data.len();
    println(f"num params: {num_params}");

    // Adam optimizer buffers (plain f64 arrays, not on tape)
    var adam_m: Vec<f64> = Vec::new();
    var adam_v: Vec<f64> = Vec::new();
    for pi in 0..num_params {
        adam_m.push(0.0);
        adam_v.push(0.0);
    }

    let learning_rate = 0.01;
    let beta1 = 0.85;
    let beta2 = 0.99;
    let eps_adam = 1.0e-8;

    // --- Training loop ---
    // 200 steps for decent training. Use 1000 for full training (matches Python).
    println(f"Training for {NUM_STEPS} steps...");

    for step in 0..NUM_STEPS {
        // Pick document
        let doc_idx = doc_order[step % num_docs];
        let ds = doc_starts[doc_idx];
        let dl = doc_lens[doc_idx];

        // Tokenize: [BOS] + chars + [BOS]
        let tokens = tokenize_doc(raw, ds, dl, char_to_tok, bos);
        // Number of (input, target) pairs, capped at the context length.
        var n = tokens.len() - 1;
        if n > BLOCK_SIZE {
            n = BLOCK_SIZE;
        }

        // Fresh tape per step: copy parameter values, discard prior computation graph
        let step_tape = tape_new();
        // Copy parameter values into new tape
        for cp in 0..num_params {
            step_tape.val(t.vd(cp));
        }

        // KV cache: keys_flat[pos * N_EMBD + d] = tape index for key vector
        let keys_flat = init_kv_cache(BLOCK_SIZE * N_EMBD);
        let vals_flat = init_kv_cache(BLOCK_SIZE * N_EMBD);

        // Forward pass: process each position
        let loss_terms: Vec<Val> = Vec::new();
        for pos_id in 0..n {
            let token_id = tokens[pos_id];
            let target_id = tokens[pos_id + 1];

            let logits = gpt_forward(step_tape, model, token_id, pos_id, keys_flat, vals_flat, vocab_size);

            // Loss: -log(softmax(logits)[target])
            let probs = softmax(step_tape, logits);
            let loss_t = step_tape.vneg(step_tape.vlog(probs[target_id]));
            loss_terms.push(loss_t);
        }

        // Average loss
        let loss_sum = step_tape.vsum(loss_terms);
        let fn_n = n.to_f64();
        let loss = step_tape.vmul_const(loss_sum, 1.0 / fn_n);

        // Backward
        backward(step_tape, loss);

        // Adam update.
        // Bias correction uses the 1-based step count. The linear decay uses the
        // 0-based step, so the first step runs at the full learning rate and the
        // last step still has a non-zero rate. Decaying from the 1-based count
        // would make the final step's update exactly zero.
        let step_f = (step + 1).to_f64();
        let lr_t = learning_rate * (1.0 - step.to_f64() / NUM_STEPS.to_f64());
        for pi in 0..num_params {
            let g = step_tape.vg(pi);
            adam_m[pi] = beta1 * adam_m[pi] + (1.0 - beta1) * g;
            adam_v[pi] = beta2 * adam_v[pi] + (1.0 - beta2) * g * g;
            let m_hat = adam_m[pi] / (1.0 - math.pow(beta1, step_f));
            let v_hat = adam_v[pi] / (1.0 - math.pow(beta2, step_f));
            // The master copy of the weights lives on `t`; step_tape is discarded.
            let new_data = t.vd(pi) - lr_t * m_hat / (math.sqrt(v_hat) + eps_adam);
            vsd(t, pi, new_data);
        }

        let loss_val = step_tape.vd(loss);
        println(f"step {step + 1} / {NUM_STEPS} | loss {loss_val}");
    }

    // --- Inference ---
    let temperature = 0.5;
    println("--- inference (new, hallucinated names) ---");

    for sample_idx in 0..20 {
        // Fresh tape for inference (copy params)
        let inf_tape = tape_new();
        for ip in 0..num_params {
            inf_tape.val(t.vd(ip));
        }

        // Fresh KV cache
        let inf_keys = init_kv_cache(BLOCK_SIZE * N_EMBD);
        let inf_vals = init_kv_cache(BLOCK_SIZE * N_EMBD);

        // Generation starts from BOS and stops at BOS or after BLOCK_SIZE tokens.
        var token_id = bos;
        var sample = "";
        var gen_pos = 0;

        while gen_pos < BLOCK_SIZE {
            let logits = gpt_forward(inf_tape, model, token_id, gen_pos, inf_keys, inf_vals, vocab_size);

            // Apply temperature
            let scaled: Vec<Val> = Vec::new();
            for d in 0..vocab_size {
                scaled.push(inf_tape.vmul_const(logits[d], 1.0 / temperature));
            }
            let probs = softmax(inf_tape, scaled);

            // Sample next token from the running (cumulative) sums of the probabilities.
            let cum_weights: Vec<f64> = Vec::new();
            var cum_total = 0.0;
            for d in 0..vocab_size {
                cum_total += inf_tape.vd(probs[d]);
                cum_weights.push(cum_total);
            }
            // NOTE(review): presumably returns an index drawn in proportion to
            // the cumulative weights — confirm against Hew's random module.
            token_id = random.choices(cum_weights, cum_total, vocab_size);

            if token_id == bos {
                // End of sequence
                break;
            } else {
                // Append character from vocab
                let ch_str = uchars[token_id];
                sample = sample + ch_str;
                gen_pos += 1;
            }
        }

        println(f"sample {sample_idx + 1}: {sample}");
    }

    0
}