use memory limit API (#13)

* add buffer cache limit

* swift-format

* use a more reasonable cache size

* add memory stats to command line tool, update to final API

* add note about changing models
David Koski
2024-03-05 15:22:12 -08:00
committed by GitHub
parent 430b464c8d
commit 61105bf0c4
3 changed files with 115 additions and 39 deletions


@@ -73,6 +73,9 @@ class LLMEvaluator {
     func load() async throws -> (LLMModel, LLM.Tokenizer) {
         switch loadState {
         case .idle:
+            // limit the buffer cache
+            MLX.GPU.set(cacheLimit: 20 * 1024 * 1024)
+
             let (model, tokenizer) = try await LLM.load(configuration: modelConfiguration) {
                 [modelConfiguration] progress in
                 DispatchQueue.main.sync {
@@ -80,6 +83,8 @@ class LLMEvaluator {
                     "Downloading \(modelConfiguration.id): \(Int(progress.fractionCompleted * 100))%"
                 }
             }
+            self.output =
+                "Loaded \(modelConfiguration.id). Weights: \(MLX.GPU.activeMemory / 1024 / 1024)M"
             loadState = .loaded(model, tokenizer)
             return (model, tokenizer)
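
The commit message also mentions adding memory stats to the command line tool, which is not part of the hunks shown above. Below is a minimal sketch of what that reporting could look like. GPU.set(cacheLimit:) and GPU.activeMemory appear in the diff; GPU.cacheMemory and GPU.peakMemory are assumed here to be part of the same MLX Swift GPU API.

import MLX

// Sketch only: constrain the GPU buffer cache before loading a model,
// then print memory stats after generation finishes.
func reportMemoryStats() {
    // Same 20 MB buffer cache limit as in the diff above.
    MLX.GPU.set(cacheLimit: 20 * 1024 * 1024)

    // ... load the model and run generation here ...

    let mb = 1024 * 1024
    print("Active memory: \(MLX.GPU.activeMemory / mb)M")
    print("Cache memory:  \(MLX.GPU.cacheMemory / mb)M")   // assumed property
    print("Peak memory:   \(MLX.GPU.peakMemory / mb)M")    // assumed property
}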