From e8423e7551247ae970082ceec842c6aea11c598d Mon Sep 17 00:00:00 2001
From: Till JS
Date: Wed, 8 Apr 2026 23:19:24 +0200
Subject: [PATCH] fix(local-llm): use two-step tokenization to fix Gemma 4 generate crash
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous attempt to fix the "Cannot read properties of null
(reading 'dims')" chat error was incomplete: I only stopped passing the
bogus return_tensor:'pt' option to apply_chat_template. The underlying
issue is that apply_chat_template's all-in-one mode (return_dict:true)
does not produce a proper Tensor-backed { input_ids, attention_mask }
pair for multimodal-capable processors like Gemma4Processor: it returns
a shape with no .dims on input_ids, so model.generate() crashes deep
inside the forward pass the moment it reads the sequence length.

Switch to the documented two-step pattern from the Gemma 4 model card:
call apply_chat_template with tokenize:false to get the formatted
prompt as a plain string, then run that string through
processor.tokenizer with return_tensors:'pt' to get a proper Tensor
pair. The tokenizer's return_tensors option follows the *Python*
naming, but it IS supported by transformers.js's Tokenizer class; the
name collision between apply_chat_template's return_tensor boolean and
the Tokenizer's return_tensors string is one of the spots where the JS
port deliberately diverges from Python.
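
In isolation, the fixed input path looks like this (a sketch, not the
exact engine code: processor, model, and messages stand in for the
engine's fields and arguments, and max_new_tokens is an arbitrary
illustrative value):

    // 1. Chat template only: returns the formatted prompt string.
    const promptText = processor.apply_chat_template(messages, {
      add_generation_prompt: true,
      tokenize: false,
    });

    // 2. Tokenize the string: returns a Tensor-backed
    //    { input_ids, attention_mask } pair.
    const inputs = processor.tokenizer(promptText, {
      return_tensors: 'pt',
    });

    // 3. Spread the pair into generate(), as the engine already does.
    const output = await model.generate({ ...inputs, max_new_tokens: 128 });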

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 packages/local-llm/src/engine.ts | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/packages/local-llm/src/engine.ts b/packages/local-llm/src/engine.ts
index 86d9aa384..a11d90aea 100644
--- a/packages/local-llm/src/engine.ts
+++ b/packages/local-llm/src/engine.ts
@@ -243,18 +243,27 @@ export class LocalLLMEngine {
 
     const start = performance.now();
 
-    // Apply Gemma's chat template via the processor's tokenizer wrapper.
-    // `add_generation_prompt: true` appends the tokens that tell the
-    // model "now generate an assistant turn". `return_dict: true` makes
-    // it return { input_ids, attention_mask } so we can spread it into
-    // model.generate(). NOTE: do NOT pass `return_tensor: 'pt'` — that
-    // is the Python `transformers` convention; transformers.js's
-    // equivalent option is just `return_tensor: true`, which is the
-    // default anyway. Passing the string broke nothing in older
-    // versions but made input shape detection unreliable.
-    const inputs = await this.processor.apply_chat_template(options.messages, {
+    // Two-step input prep, matching the Gemma 4 model-card example:
+    //   1. Apply the chat template with tokenize:false to get the
+    //      formatted prompt as a plain string (no tokens, no tensor).
+    //   2. Run the string through the processor's tokenizer with
+    //      return_tensors:'pt' to get a proper { input_ids, attention_mask }
+    //      pair backed by transformers.js Tensor objects.
+    //
+    // We previously asked apply_chat_template to do everything in one
+    // shot via `return_dict: true`, but for Gemma4ForConditionalGeneration
+    // that path returned a malformed shape (no .dims on input_ids), and
+    // model.generate() then crashed deep inside the forward pass with
+    // "Cannot read properties of null (reading 'dims')", surfacing as an
+    // opaque chat error. This two-step path is what the transformers.js
+    // examples for multimodal-capable processors use.
+    const promptText: string = this.processor.apply_chat_template(options.messages, {
       add_generation_prompt: true,
-      return_dict: true,
+      tokenize: false,
+    });
+
+    const inputs = this.processor.tokenizer(promptText, {
+      return_tensors: 'pt',
     });
 
     const promptTokenCount = this.tensorLength(inputs?.input_ids);