mana-swift-llm/Sources/ManaLLM/GemmaBackend.swift
till fd376bbdce L-1+L-2+L-3: mana-swift-llm Initial — lift aus memoro-native
Neues Swift-Package mit lokalen LLM-Backends für alle nativen mana-
e.V.-Apps. Lift der bisher Memoro-eigenen Files in
`memoro-native/Sources/Core/AI/` plus zwei neue Layer:
ManaSharedModels (App-Group-Container-Helper) und ManaLLM-Facade.

Library-Products:
- ManaLLM — Backend-Abstraktion (FoundationModels, Gemma 4 E2B/E4B,
  NoOp), Router mit Priority-Liste, High-Level-Facade
  `ManaLLM.summarize/generate/classify` mit fast/creative/deep Level.
- ManaLLMShared — App-Group `group.ev.mana.models` Container,
  HF_HUB_CACHE-Setup, Legacy-Fallback wenn Group fehlt.

Lift-Anpassungen ggü. memoro:
- public-Marker auf protocol + types + actors
- generischer `generate(prompt:instructions:maxTokens:)` zu
  LLMBackend-Protocol hinzu; `summarize` als Default-Impl auf
  Basis von generate
- AppleFMBackend behält optimierten @Generable-Summary-Path
- GemmaBackend nutzt ManaSharedModels.effectiveCacheURL() statt
  eigenen Application-Support-Pfad; allowsCellular kommt jetzt
  als Initializer-Param statt App-Settings-Lookup
- LLMRouter: Memoro-spezifische User-Pref-Store-Logic durch
  Priority-Liste-API ersetzt
- LLMLog-Subsystem `ev.mana.llm` statt App-eigenes `Log.ai`

Build: `swift build` clean (76s, MLX-Toolchain-Resolution beim
ersten Lauf). 4/4 Parser-Tests grün.

Doku: ../mana/docs/MANA_LLM.md (Plattform-SOT), CLAUDE.md
(Konventionen + Lift-Tabelle).

Folge: L-4 Memoro auf ManaLLM umstellen, L-5 pageta-Pilot.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 22:55:32 +02:00

282 lines
9.4 KiB
Swift
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import Foundation
import HuggingFace
import ManaLLMShared
import MLXHuggingFace
import MLXLLM
import MLXLMCommon
import OSLog
import Tokenizers
/// `LLMBackend` über MLX-Swift-LM mit einem Gemma-Modell aus dem
/// HuggingFace `mlx-community/`-Namespace. Lädt das Modell beim
/// ersten `prepare()` herunter und hält den `ModelContainer` für
/// die App-Lifetime im Speicher.
///
/// **Cross-App-Sharing:** der HuggingFace-Cache lebt im
/// `ManaSharedModels.effectiveCacheURL()` — bei korrekt
/// konfiguriertem App-Group-Entitlement `group.ev.mana.models` ist
/// das der gemeinsame Container, sonst der App-eigene Application-
/// Support-Fallback. **Eine App lädt, alle anderen lesen**.
///
/// **Modell-Wahl (Mai 2026):** `gemma4_e2b_it_4bit` und
/// `gemma4_e4b_it_4bit` aus `LLMRegistry`. Direkte Quellen auf HF:
/// - mlx-community/gemma-4-e2b-it-4bit (~1.3 GB)
/// - mlx-community/gemma-4-e4b-it-4bit (~2.5 GB)
///
/// **WiFi-only-Download:** Default. Apps können `allowsCellular: true`
/// im Initializer übergeben, wenn der User explizit über Mobilfunk
/// laden will.
public actor GemmaBackend: LLMBackend {
/// Gemma model sizes this backend can serve. Every case maps to one
/// `LLMRegistry` entry and one `mlx-community` repository.
public enum Variant: Sendable {
    case e2b
    case e4b

    /// Repository identifier on the HuggingFace Hub (`owner/name`).
    var hfRepoID: String {
        switch self {
        case .e2b:
            return "mlx-community/gemma-4-e2b-it-4bit"
        case .e4b:
            return "mlx-community/gemma-4-e4b-it-4bit"
        }
    }

    /// Name of the repository's folder below the HF cache root.
    var hfRepoFolderName: String {
        switch self {
        case .e2b:
            return "models--mlx-community--gemma-4-e2b-it-4bit"
        case .e4b:
            return "models--mlx-community--gemma-4-e4b-it-4bit"
        }
    }

    /// Size in bytes reported through `.requiresDownload(estimatedBytes:)`.
    /// NOTE(review): the type-level documentation quotes ~1.3 GB / ~2.5 GB
    /// for these repositories — confirm which figure is current.
    var estimatedBytes: Int64 {
        switch self {
        case .e2b:
            return 3_614_000_000
        case .e4b:
            return 5_250_000_000
        }
    }

    /// Model configuration handed to the model factory when loading.
    var modelConfiguration: ModelConfiguration {
        switch self {
        case .e2b:
            return LLMRegistry.gemma4_e2b_it_4bit
        case .e4b:
            return LLMRegistry.gemma4_e4b_it_4bit
        }
    }
}
/// Backend identifier; assigned in `init` from the chosen variant.
public let identifier: LLMBackendID
/// Model size this instance serves.
private let variant: Variant
/// Copied into `URLSessionConfiguration.allowsCellularAccess` for model downloads.
private let allowsCellular: Bool
/// Loaded model. `nil` until `prepare` succeeds, and again after `removeCachedModel()`.
private var container: ModelContainer?
/// Creates a backend for one Gemma variant. Nothing is downloaded or
/// loaded here.
///
/// - Parameters:
///   - variant: Model size to serve.
///   - allowsCellular: Whether model downloads may use cellular data.
///     Defaults to `false`.
public init(variant: Variant, allowsCellular: Bool = false) {
    self.variant = variant
    self.allowsCellular = allowsCellular
    // Exhaustive switch: a new variant must pick its identifier explicitly.
    switch variant {
    case .e2b:
        identifier = .gemmaE2B
    case .e4b:
        identifier = .gemmaE4B
    }
}
// MARK: - Availability
/// Reports whether the model can be used without a download.
///
/// - Returns: `.available` when a container is already loaded or a cached
///   snapshot is found on disk; otherwise `.requiresDownload` carrying the
///   variant's estimated size.
public func availability() async -> LLMAvailability {
    // The disk lookup is skipped when the container is already in memory.
    guard container == nil, !isModelCached() else {
        return .available
    }
    return .requiresDownload(estimatedBytes: variant.estimatedBytes)
}
/// Checks whether a snapshot of the variant's repository is on disk.
///
/// Looks into `<cache root>/<repo folder>/snapshots` and reports `true` as
/// soon as one entry contains a `config.json`.
///
/// - Returns: `true` if such a snapshot exists; `false` if the cache root
///   is unknown, the folder is missing or unreadable, or no entry matches.
private func isModelCached() -> Bool {
    guard let cacheRoot = huggingFaceCacheRoot() else { return false }
    let fileManager = FileManager.default
    let snapshotsDir = cacheRoot
        .appending(path: variant.hfRepoFolderName)
        .appending(path: "snapshots")

    // `URL.path()` percent-encodes by default. A cache root containing a
    // space (e.g. "Application Support") would become "Application%20Support"
    // and never match on disk, so ask for the unencoded file-system path.
    guard fileManager.fileExists(atPath: snapshotsDir.path(percentEncoded: false)) else {
        return false
    }
    guard let snapshots = try? fileManager.contentsOfDirectory(
        at: snapshotsDir, includingPropertiesForKeys: nil
    ) else {
        return false
    }

    // NOTE(review): only `config.json` is checked; whether an interrupted
    // download can leave it behind without the weights is not visible here.
    return snapshots.contains { snapshot in
        let configURL = snapshot.appending(path: "config.json")
        return fileManager.fileExists(atPath: configURL.path(percentEncoded: false))
    }
}
/// Resolves the HuggingFace cache directory. Priority:
/// 1. The `HF_HUB_CACHE` environment variable, if set and non-empty
///    (e.g. set at app boot via
///    `ManaSharedModels.configureHuggingFaceCacheEnv()`).
/// 2. `ManaSharedModels.effectiveCacheURL()`.
///
/// - Returns: The cache root as a file URL, or `nil` if neither source
///   yields one.
private func huggingFaceCacheRoot() -> URL? {
    // An empty value would turn into a URL for the current working
    // directory; treat it like an unset variable instead.
    if let envCache = ProcessInfo.processInfo.environment["HF_HUB_CACHE"],
       !envCache.isEmpty
    {
        return URL(fileURLWithPath: envCache)
    }
    return ManaSharedModels.effectiveCacheURL()
}
// MARK: - Prepare (Download + Init)
/// Downloads the model if necessary and loads it into memory.
///
/// Reports `.ready` and returns at once when a container is already
/// loaded. Otherwise reports `.downloading` updates while the model factory
/// works, stores the resulting container, and finishes with `.ready`.
///
/// - Parameter onProgress: Called with every progress update.
/// - Throws: Rethrows any error from loading the container, after logging it.
public func prepare(
    onProgress: @Sendable @escaping (LLMPrepareUpdate) -> Void
) async throws {
    guard container == nil else {
        onProgress(LLMPrepareUpdate(stage: .ready, fractionCompleted: 1.0))
        return
    }

    onProgress(LLMPrepareUpdate(stage: .downloading, fractionCompleted: 0))
    let hub = makeHubClient()
    let configuration = variant.modelConfiguration

    do {
        container = try await LLMModelFactory.shared.loadContainer(
            from: #hubDownloader(hub),
            using: #huggingFaceTokenizerLoader(),
            configuration: configuration
        ) { progress in
            let done = progress.completedUnitCount
            let total = progress.totalUnitCount
            let fraction = progress.fractionCompleted
            LLMLog.download.debug(
                "Gemma progress: completed=\(done, privacy: .public)/\(total, privacy: .public) fraction=\(fraction, privacy: .public)"
            )
            // Counts of 0 (completed) and totals of 0 or 1 are reported
            // as nil rather than as byte figures.
            let update = LLMPrepareUpdate(
                stage: .downloading,
                fractionCompleted: fraction,
                bytesCompleted: done > 0 ? done : nil,
                bytesTotal: total > 1 ? total : nil
            )
            onProgress(update)
        }
        onProgress(LLMPrepareUpdate(stage: .ready, fractionCompleted: 1.0))
        let name = configuration.name
        LLMLog.backend.notice("GemmaBackend ready (\(name, privacy: .public))")
    } catch {
        let message = String(describing: error)
        LLMLog.backend.error("GemmaBackend prepare failed: \(message, privacy: .public)")
        throw error
    }
}
/// Builds the Hub client used for model downloads.
///
/// The session waits for connectivity, follows the `allowsCellular`
/// setting, and uses a 60 s request timeout and a 7200 s resource timeout.
/// A `HubCache` is attached only when a cache root can be resolved.
///
/// - Returns: A `HubClient` backed by a freshly created `URLSession`.
private func makeHubClient() -> HubClient {
    let sessionConfig = URLSessionConfiguration.default
    sessionConfig.waitsForConnectivity = true
    sessionConfig.allowsCellularAccess = allowsCellular
    sessionConfig.timeoutIntervalForRequest = 60
    sessionConfig.timeoutIntervalForResource = 7200
    let session = URLSession(configuration: sessionConfig)

    let cache: HubCache?
    if let root = huggingFaceCacheRoot() {
        cache = HubCache(cacheDirectory: root)
    } else {
        cache = nil
    }
    return HubClient(session: session, cache: cache)
}
// MARK: - Generate
/// Generates a free-form answer to `prompt` with the loaded Gemma model.
///
/// - Parameters:
///   - prompt: User prompt; surrounding whitespace is trimmed.
///   - instructions: Optional system instructions. `nil` is passed on as
///     an empty string.
///   - maxTokens: Upper bound for generated tokens. Values `<= 0` leave
///     the library's default in place.
/// - Returns: The trimmed model response, or `nil` if the prompt is empty,
///   `prepare` has not loaded a container yet, or generation fails.
public func generate(
    prompt: String,
    instructions: String?,
    maxTokens: Int
) async -> String? {
    let trimmed = prompt.trimmingCharacters(in: .whitespacesAndNewlines)
    guard !trimmed.isEmpty else { return nil }
    guard let container else {
        LLMLog.backend.notice("GemmaBackend.generate called before prepare — returning nil")
        return nil
    }

    // Honor the caller's token budget; it used to be silently discarded.
    var parameters = GenerateParameters()
    if maxTokens > 0 {
        parameters.maxTokens = maxTokens
    }

    let session = ChatSession(
        container,
        instructions: instructions ?? "",
        generateParameters: parameters
    )
    do {
        let response = try await session.respond(to: trimmed)
        let text = response.trimmingCharacters(in: .whitespacesAndNewlines)
        LLMLog.backend.notice(
            "Gemma generate OK (\(text.count, privacy: .public) chars)"
        )
        return text
    } catch {
        let message = String(describing: error)
        LLMLog.backend.error("Gemma generate failed: \(message, privacy: .public)")
        return nil
    }
}
// MARK: - Summary (Gemma-optimierter JSON-Path)
/// Summarizes a transcript into a headline and a short intro, asking the
/// model for a JSON object and parsing it with `parseSummary`.
///
/// - Parameter transcript: Transcript text; surrounding whitespace is
///   trimmed and only the first 8000 characters are sent to the model.
/// - Returns: The parsed summary, or `nil` if the transcript is empty,
///   `prepare` has not loaded a container yet, or generation fails.
public func summarize(transcript: String) async -> LLMSummary? {
    let trimmed = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
    guard !trimmed.isEmpty else { return nil }
    guard let container else {
        LLMLog.backend.notice("GemmaBackend.summarize called before prepare — returning nil")
        return nil
    }
    // Gemma 4 has a 256 K token window, so we clip more softly than Apple FM
    // (8000 chars instead of 3000). Map-reduce over long inputs lives in
    // the caller's path.
    let clipped = String(trimmed.prefix(8000))
    // The intro length must read "1-2 Sätze": with the dash missing the
    // prompt asked for twelve sentences, which contradicts a short summary.
    let instructions = "Du bist ein deutscher Assistent, der gesprochene "
        + "Sprachmemos kurz zusammenfasst. Antworte auf Deutsch, ohne Floskeln, "
        + "ohne Anrede. Antworte ausschließlich als JSON-Objekt mit den "
        + "Feldern \"headline\" (String, maximal 80 Zeichen, kein Punkt am Ende, "
        + "keine Anführungszeichen) und \"intro\" (String, 1-2 Sätze). "
        + "Keine zusätzlichen Felder, kein Markdown, keine Erklärungen."
    let prompt = "Transkript:\n\(clipped)\n\nGib jetzt das JSON aus."
    let session = ChatSession(container, instructions: instructions)
    do {
        let response = try await session.respond(to: prompt)
        return parseSummary(response)
    } catch {
        let message = String(describing: error)
        LLMLog.backend.error("GemmaBackend summarize failed: \(message, privacy: .public)")
        return nil
    }
}
/// Deletes the model from the HF cache and drops the loaded container.
/// Caution: in a shared container this affects ALL participating apps.
///
/// - Throws: Whatever `ManaSharedModels.removeModel(repo:)` throws. The
///   in-memory container has already been released at that point.
public func removeCachedModel() throws {
    let repoID = variant.hfRepoID
    container = nil
    try ManaSharedModels.removeModel(repo: repoID)
    LLMLog.backend.notice(
        "GemmaBackend removed cache for \(self.variant.hfRepoFolderName, privacy: .public)"
    )
}
/// Extracts headline and intro from raw model output. JSON is preferred;
/// a rough heuristic serves as fallback because small models do not
/// always stick to the schema.
///
/// - Parameter raw: Unprocessed model response.
/// - Returns: A summary whose headline is capped at 80 characters, or
///   `nil` when the response is empty after trimming.
private func parseSummary(_ raw: String) -> LLMSummary? {
    let trimmed = raw.trimmingCharacters(in: .whitespacesAndNewlines)
    // Empty output carries no summary; report failure instead of returning
    // a summary with an empty headline and intro.
    guard !trimmed.isEmpty else {
        LLMLog.backend.notice("GemmaBackend summary empty — returning nil")
        return nil
    }

    // Take everything from the first "{" to the last "}" so that text or
    // Markdown fences around the JSON object are ignored. The ordering
    // check matters: forming `start ... end` with start > end traps, which
    // output such as "} {" would otherwise trigger.
    if let jsonStart = trimmed.firstIndex(of: "{"),
       let jsonEnd = trimmed.lastIndex(of: "}"),
       jsonStart < jsonEnd,
       let data = String(trimmed[jsonStart ... jsonEnd]).data(using: .utf8),
       let obj = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
       let headline = obj["headline"] as? String,
       let intro = obj["intro"] as? String
    {
        // Strip quotation marks, guillemets and periods around the headline.
        let trimSet = CharacterSet(
            charactersIn: "\"\u{201E}\u{201C}\u{201D}.\u{00BB}\u{00AB}"
        )
        let cleanHeadline = headline.trimmingCharacters(in: .whitespacesAndNewlines)
            .trimmingCharacters(in: trimSet)
        let cleanIntro = intro.trimmingCharacters(in: .whitespacesAndNewlines)
        LLMLog.backend.notice(
            "GemmaBackend summary OK (json, headline=\(cleanHeadline.count, privacy: .public)c)"
        )
        return LLMSummary(
            headline: String(cleanHeadline.prefix(80)),
            intro: cleanIntro
        )
    }

    // Fallback: text before the first period becomes the headline, the
    // remainder the intro.
    LLMLog.backend.notice("GemmaBackend summary fallback (kein valides JSON)")
    let sentences = trimmed.split(separator: ".", maxSplits: 1, omittingEmptySubsequences: true)
    let headline = String(sentences.first ?? "").prefix(80)
    let intro = sentences.count > 1
        ? String(sentences[1]).trimmingCharacters(in: .whitespacesAndNewlines)
        : ""
    return LLMSummary(headline: String(headline), intro: intro)
}
}