Neues Swift-Package mit lokalen LLM-Backends für alle nativen mana- e.V.-Apps. Lift der bisher Memoro-eigenen Files in `memoro-native/Sources/Core/AI/` plus zwei neue Layer: ManaSharedModels (App-Group-Container-Helper) und ManaLLM-Facade. Library-Products: - ManaLLM — Backend-Abstraktion (FoundationModels, Gemma 4 E2B/E4B, NoOp), Router mit Priority-Liste, High-Level-Facade `ManaLLM.summarize/generate/classify` mit fast/creative/deep Level. - ManaLLMShared — App-Group `group.ev.mana.models` Container, HF_HUB_CACHE-Setup, Legacy-Fallback wenn Group fehlt. Lift-Anpassungen ggü. memoro: - public-Marker auf protocol + types + actors - generischer `generate(prompt:instructions:maxTokens:)` zu LLMBackend-Protocol hinzu; `summarize` als Default-Impl auf Basis von generate - AppleFMBackend behält optimierten @Generable-Summary-Path - GemmaBackend nutzt ManaSharedModels.effectiveCacheURL() statt eigenen Application-Support-Pfad; allowsCellular kommt jetzt als Initializer-Param statt App-Settings-Lookup - LLMRouter: Memoro-spezifische User-Pref-Store-Logic durch Priority-Liste-API ersetzt - LLMLog-Subsystem `ev.mana.llm` statt App-eigenes `Log.ai` Build: `swift build` clean (76s, MLX-Toolchain-Resolution beim ersten Lauf). 4/4 Parser-Tests grün. Doku: ../mana/docs/MANA_LLM.md (Plattform-SOT), CLAUDE.md (Konventionen + Lift-Tabelle). Folge: L-4 Memoro auf ManaLLM umstellen, L-5 pageta-Pilot. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
282 lines
9.4 KiB
Swift
282 lines
9.4 KiB
Swift
import Foundation
|
||
import HuggingFace
|
||
import ManaLLMShared
|
||
import MLXHuggingFace
|
||
import MLXLLM
|
||
import MLXLMCommon
|
||
import OSLog
|
||
import Tokenizers
|
||
|
||
/// `LLMBackend` über MLX-Swift-LM mit einem Gemma-Modell aus dem
|
||
/// HuggingFace `mlx-community/`-Namespace. Lädt das Modell beim
|
||
/// ersten `prepare()` herunter und hält den `ModelContainer` für
|
||
/// die App-Lifetime im Speicher.
|
||
///
|
||
/// **Cross-App-Sharing:** der HuggingFace-Cache lebt im
|
||
/// `ManaSharedModels.effectiveCacheURL()` — bei korrekt
|
||
/// konfiguriertem App-Group-Entitlement `group.ev.mana.models` ist
|
||
/// das der gemeinsame Container, sonst der App-eigene Application-
|
||
/// Support-Fallback. **Eine App lädt, alle anderen lesen**.
|
||
///
|
||
/// **Modell-Wahl (Mai 2026):** `gemma4_e2b_it_4bit` und
|
||
/// `gemma4_e4b_it_4bit` aus `LLMRegistry`. Direkte Quellen auf HF:
|
||
/// - mlx-community/gemma-4-e2b-it-4bit (~1.3 GB)
|
||
/// - mlx-community/gemma-4-e4b-it-4bit (~2.5 GB)
|
||
///
|
||
/// **WiFi-only-Download:** Default. Apps können `allowsCellular: true`
|
||
/// im Initializer übergeben, wenn der User explizit über Mobilfunk
|
||
/// laden will.
|
||
public actor GemmaBackend: LLMBackend {
|
||
/// Gemma model variants this backend can load.
///
/// Each case bundles the MLX model configuration with the Hugging Face
/// identifiers needed to locate the downloaded files in the cache.
public enum Variant: Sendable {
    case e2b
    case e4b

    /// Hugging Face repository ID (`owner/name`) of the variant.
    var hfRepoID: String {
        switch self {
        case .e2b:
            return "mlx-community/gemma-4-e2b-it-4bit"
        case .e4b:
            return "mlx-community/gemma-4-e4b-it-4bit"
        }
    }

    /// Name of the repository folder inside the Hugging Face cache root
    /// (the `models--<owner>--<name>` form of `hfRepoID`).
    var hfRepoFolderName: String {
        switch self {
        case .e2b:
            return "models--mlx-community--gemma-4-e2b-it-4bit"
        case .e4b:
            return "models--mlx-community--gemma-4-e4b-it-4bit"
        }
    }

    /// Size reported to callers via `LLMAvailability.requiresDownload`.
    ///
    /// NOTE(review): the type-level documentation of `GemmaBackend` quotes
    /// ~1.3 GB / ~2.5 GB for these repositories, which does not match the
    /// figures below — confirm which numbers are current.
    var estimatedBytes: Int64 {
        switch self {
        case .e2b:
            return 3_614_000_000
        case .e4b:
            return 5_250_000_000
        }
    }

    /// MLX model configuration taken from `LLMRegistry`.
    var modelConfiguration: ModelConfiguration {
        switch self {
        case .e2b:
            return LLMRegistry.gemma4_e2b_it_4bit
        case .e4b:
            return LLMRegistry.gemma4_e4b_it_4bit
        }
    }
}
|
||
|
||
/// Backend identifier; derived from the variant in `init`.
public let identifier: LLMBackendID
|
||
/// Gemma variant this backend instance loads.
private let variant: Variant
|
||
/// Forwarded to `URLSessionConfiguration.allowsCellularAccess` when the
/// download session is created.
private let allowsCellular: Bool
|
||
/// Loaded model container. `nil` until `prepare` has succeeded, and reset
/// to `nil` by `removeCachedModel`.
private var container: ModelContainer?
|
||
|
||
/// Creates a backend for one Gemma variant.
///
/// - Parameters:
///   - variant: Which Gemma model to load.
///   - allowsCellular: Whether the model download may use a cellular
///     connection. Defaults to `false`.
public init(variant: Variant, allowsCellular: Bool = false) {
    // Exhaustive switch: adding a new variant forces an explicit mapping here.
    switch variant {
    case .e2b:
        identifier = .gemmaE2B
    case .e4b:
        identifier = .gemmaE4B
    }
    self.variant = variant
    self.allowsCellular = allowsCellular
}
|
||
|
||
// MARK: - Availability
|
||
|
||
/// Reports whether the model can be used without a download.
///
/// - Returns: `.available` when a container is already loaded or the model
///   files are found in the cache; otherwise `.requiresDownload` carrying
///   the variant's estimated size.
public func availability() async -> LLMAvailability {
    // The cache lookup touches the file system, so it only runs when no
    // container is loaded.
    guard container == nil, !isModelCached() else {
        return .available
    }
    return .requiresDownload(estimatedBytes: variant.estimatedBytes)
}
|
||
|
||
/// Checks whether a snapshot of this variant exists in the Hugging Face cache.
///
/// A snapshot counts as present when
/// `<cacheRoot>/<hfRepoFolderName>/snapshots/<revision>/config.json` exists
/// for at least one revision directory.
///
/// - Returns: `true` if such a snapshot is found; `false` if the cache root
///   is unknown, the snapshots directory cannot be read, or no revision
///   contains a `config.json`.
private func isModelCached() -> Bool {
    guard let cacheRoot = huggingFaceCacheRoot() else { return false }

    let fileManager = FileManager.default
    let snapshotsDir = cacheRoot
        .appending(path: variant.hfRepoFolderName)
        .appending(path: "snapshots")

    // A missing or unreadable directory simply means "not cached".
    guard let revisions = try? fileManager.contentsOfDirectory(
        at: snapshotsDir, includingPropertiesForKeys: nil
    ) else {
        return false
    }

    return revisions.contains { revision in
        let configURL = revision.appending(path: "config.json")
        // `URL.path()` percent-encodes by default, which turns a space in
        // "Application Support" or "Group Containers" into "%20" and makes
        // the FileManager lookup fail. FileManager needs the decoded path.
        return fileManager.fileExists(atPath: configURL.path(percentEncoded: false))
    }
}
|
||
|
||
/// Resolves the Hugging Face cache root. Priority:
/// 1. The `HF_HUB_CACHE` environment variable, if set to a non-empty
///    value (e.g. exported at app boot via
///    `ManaSharedModels.configureHuggingFaceCacheEnv()`).
/// 2. `ManaSharedModels.effectiveCacheURL()`.
///
/// - Returns: The cache root directory, or `nil` if none can be determined.
private func huggingFaceCacheRoot() -> URL? {
    // An empty value is treated as "unset": `URL(fileURLWithPath: "")`
    // would otherwise resolve to the current working directory.
    if let envCache = ProcessInfo.processInfo.environment["HF_HUB_CACHE"],
       !envCache.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
    {
        return URL(fileURLWithPath: envCache)
    }
    return ManaSharedModels.effectiveCacheURL()
}
|
||
|
||
// MARK: - Prepare (Download + Init)
|
||
|
||
/// Loads the model container, downloading the model files if necessary.
///
/// Returns immediately (after reporting `.ready`) when a container is
/// already loaded. Otherwise reports `.downloading` progress while the
/// model factory runs and `.ready` once the container is stored.
///
/// - Parameter onProgress: Receives stage and progress updates.
/// - Throws: Rethrows any error from the model factory after logging it.
///
/// NOTE(review): two overlapping calls can both pass the `container == nil`
/// check before either finishes, so both would load the model — confirm
/// that callers serialize `prepare`.
public func prepare(
    onProgress: @Sendable @escaping (LLMPrepareUpdate) -> Void
) async throws {
    guard container == nil else {
        onProgress(LLMPrepareUpdate(stage: .ready, fractionCompleted: 1.0))
        return
    }

    onProgress(LLMPrepareUpdate(stage: .downloading, fractionCompleted: 0))

    let hub = makeHubClient()
    let configuration = variant.modelConfiguration
    do {
        container = try await LLMModelFactory.shared.loadContainer(
            from: #hubDownloader(hub),
            using: #huggingFaceTokenizerLoader(),
            configuration: configuration
        ) { progress in
            let completedUnits = progress.completedUnitCount
            let totalUnits = progress.totalUnitCount
            let fractionDone = progress.fractionCompleted
            LLMLog.download.debug(
                "Gemma progress: completed=\(completedUnits, privacy: .public)/\(totalUnits, privacy: .public) fraction=\(fractionDone, privacy: .public)"
            )
            // Counts of 0 completed or a total of at most 1 are reported
            // as "unknown" (`nil`).
            let update = LLMPrepareUpdate(
                stage: .downloading,
                fractionCompleted: fractionDone,
                bytesCompleted: completedUnits > 0 ? completedUnits : nil,
                bytesTotal: totalUnits > 1 ? totalUnits : nil
            )
            onProgress(update)
        }
        onProgress(LLMPrepareUpdate(stage: .ready, fractionCompleted: 1.0))
        let modelName = configuration.name
        LLMLog.backend.notice("GemmaBackend ready (\(modelName, privacy: .public))")
    } catch {
        let description = String(describing: error)
        LLMLog.backend.error("GemmaBackend prepare failed: \(description, privacy: .public)")
        throw error
    }
}
|
||
|
||
/// Builds the Hugging Face hub client used for model downloads.
///
/// The URL session honours `allowsCellular`, waits for connectivity, and
/// uses a 60 s request timeout and a 7200 s resource timeout. The client
/// gets a cache at `huggingFaceCacheRoot()` when that root is known, and
/// no explicit cache otherwise.
private func makeHubClient() -> HubClient {
    let hubCache: HubCache?
    if let root = huggingFaceCacheRoot() {
        hubCache = HubCache(cacheDirectory: root)
    } else {
        hubCache = nil
    }

    let sessionConfiguration = URLSessionConfiguration.default
    sessionConfiguration.waitsForConnectivity = true
    sessionConfiguration.timeoutIntervalForRequest = 60
    sessionConfiguration.timeoutIntervalForResource = 7200
    sessionConfiguration.allowsCellularAccess = allowsCellular

    return HubClient(
        session: URLSession(configuration: sessionConfiguration),
        cache: hubCache
    )
}
|
||
|
||
// MARK: - Generate
|
||
|
||
/// Generates a free-form completion for `prompt`.
///
/// - Parameters:
///   - prompt: User prompt; surrounding whitespace is trimmed.
///   - instructions: Optional system instructions. `nil` is passed to the
///     chat session as an empty string.
///   - maxTokens: Upper bound on generated tokens. Values `<= 0` mean
///     "no explicit limit".
/// - Returns: The trimmed model response, or `nil` if the prompt is empty,
///   `prepare` has not loaded the model yet, or generation fails.
public func generate(
    prompt: String,
    instructions: String?,
    maxTokens: Int
) async -> String? {
    let trimmed = prompt.trimmingCharacters(in: .whitespacesAndNewlines)
    guard !trimmed.isEmpty else { return nil }
    guard let container else {
        LLMLog.backend.notice("GemmaBackend.generate called before prepare — returning nil")
        return nil
    }

    // Forward the caller's token budget; it was previously discarded, so
    // generation length was never capped.
    var parameters = GenerateParameters()
    parameters.maxTokens = maxTokens > 0 ? maxTokens : nil

    let session = ChatSession(
        container,
        instructions: instructions ?? "",
        generateParameters: parameters
    )
    do {
        let response = try await session.respond(to: trimmed)
        let text = response.trimmingCharacters(in: .whitespacesAndNewlines)
        LLMLog.backend.notice(
            "Gemma generate OK (\(text.count, privacy: .public) chars)"
        )
        return text
    } catch {
        let message = String(describing: error)
        LLMLog.backend.error("Gemma generate failed: \(message, privacy: .public)")
        return nil
    }
}
|
||
|
||
// MARK: - Summary (Gemma-optimierter JSON-Path)
|
||
|
||
/// Summarizes a transcript into a headline and a short intro.
///
/// The model is asked to answer with a JSON object; the response is handed
/// to `parseSummary`.
///
/// - Parameter transcript: Raw transcript text; surrounding whitespace is
///   trimmed and only the first 8000 characters are sent to the model.
/// - Returns: The parsed summary, or `nil` if the transcript is empty,
///   `prepare` has not loaded the model yet, or generation fails.
public func summarize(transcript: String) async -> LLMSummary? {
    let source = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
    if source.isEmpty { return nil }

    guard let model = container else {
        LLMLog.backend.notice("GemmaBackend.summarize called before prepare — returning nil")
        return nil
    }

    // Clip at 8000 characters. The original note says this is deliberately
    // more generous than the Apple FM backend (3000) because of Gemma 4's
    // larger context window, and that map-reduce over long inputs is the
    // caller's responsibility.
    let excerpt = String(source.prefix(8000))

    let systemPrompt = [
        "Du bist ein deutscher Assistent, der gesprochene ",
        "Sprachmemos kurz zusammenfasst. Antworte auf Deutsch, ohne Floskeln, ",
        "ohne Anrede. Antworte ausschließlich als JSON-Objekt mit den ",
        "Feldern \"headline\" (String, maximal 80 Zeichen, kein Punkt am Ende, ",
        "keine Anführungszeichen) und \"intro\" (String, 1–2 Sätze). ",
        "Keine zusätzlichen Felder, kein Markdown, keine Erklärungen.",
    ].joined()

    let userPrompt = "Transkript:\n\(excerpt)\n\nGib jetzt das JSON aus."

    do {
        let chat = ChatSession(model, instructions: systemPrompt)
        let reply = try await chat.respond(to: userPrompt)
        return parseSummary(reply)
    } catch {
        let description = String(describing: error)
        LLMLog.backend.error("GemmaBackend summarize failed: \(description, privacy: .public)")
        return nil
    }
}
|
||
|
||
/// Deletes this variant's model from the Hugging Face cache.
///
/// Caution: when the cache lives in a shared container, this affects ALL
/// participating apps.
///
/// The in-memory container is released first, so it is dropped even if the
/// removal throws.
///
/// - Throws: Whatever `ManaSharedModels.removeModel(repo:)` throws.
public func removeCachedModel() throws {
    container = nil
    let repoID = variant.hfRepoID
    try ManaSharedModels.removeModel(repo: repoID)
    let folderName = variant.hfRepoFolderName
    LLMLog.backend.notice(
        "GemmaBackend removed cache for \(folderName, privacy: .public)"
    )
}
|
||
|
||
/// Extracts headline and intro from raw model output.
///
/// JSON is preferred: the span from the first `{` to the last `}` is
/// decoded and must contain string fields `headline` and `intro`. If that
/// fails, a coarse heuristic is used instead (small models do not always
/// stick to the schema): the text up to the first `.` becomes the headline
/// and the remainder becomes the intro.
///
/// - Parameter raw: Raw model response.
/// - Returns: The parsed summary, or `nil` when the response contains no
///   usable text (empty, or nothing but sentence separators).
private func parseSummary(_ raw: String) -> LLMSummary? {
    let trimmed = raw.trimmingCharacters(in: .whitespacesAndNewlines)
    guard !trimmed.isEmpty else {
        LLMLog.backend.notice("GemmaBackend summary empty — returning nil")
        return nil
    }

    // `jsonStart < jsonEnd` is required: forming `jsonStart ... jsonEnd`
    // traps when the last "}" precedes the first "{" (e.g. "} {").
    if let jsonStart = trimmed.firstIndex(of: "{"),
       let jsonEnd = trimmed.lastIndex(of: "}"),
       jsonStart < jsonEnd,
       let data = String(trimmed[jsonStart ... jsonEnd]).data(using: .utf8),
       let obj = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
       let headline = obj["headline"] as? String,
       let intro = obj["intro"] as? String
    {
        // Strip straight/typographic quotes, guillemets and a period that
        // the model may have put around the headline despite instructions.
        let trimSet = CharacterSet(
            charactersIn: "\"\u{201E}\u{201C}\u{201D}.\u{00BB}\u{00AB}"
        )
        let cleanHeadline = headline.trimmingCharacters(in: .whitespacesAndNewlines)
            .trimmingCharacters(in: trimSet)
        let cleanIntro = intro.trimmingCharacters(in: .whitespacesAndNewlines)
        LLMLog.backend.notice(
            "GemmaBackend summary OK (json, headline=\(cleanHeadline.count, privacy: .public)c)"
        )
        return LLMSummary(
            headline: String(cleanHeadline.prefix(80)),
            intro: cleanIntro
        )
    }

    LLMLog.backend.notice("GemmaBackend summary fallback (kein valides JSON)")
    let sentences = trimmed.split(separator: ".", maxSplits: 1, omittingEmptySubsequences: true)
    // No segment at all means the output consisted only of "." characters.
    guard let firstSentence = sentences.first else { return nil }
    let headline = String(firstSentence.prefix(80))
    let intro = sentences.count > 1
        ? String(sentences[1]).trimmingCharacters(in: .whitespacesAndNewlines)
        : ""
    return LLMSummary(headline: headline, intro: intro)
}
|
||
}
|