From 61105bf0c41b3edda68a9087f0e0c817b1dd580e Mon Sep 17 00:00:00 2001
From: David Koski <46639364+davidkoski@users.noreply.github.com>
Date: Tue, 5 Mar 2024 15:22:12 -0800
Subject: [PATCH] use memory limit API (#13)

* add buffer cache limit

* swift-format

* a more reasonable size

* add memory stats to command line tool, update to final api

* add note about changing models
---
 Applications/LLMEval/ContentView.swift |   5 +
 Applications/LLMEval/README.md         |  16 +++
 Tools/llm-tool/LLMTool.swift           | 133 +++++++++++++++++--------
 3 files changed, 115 insertions(+), 39 deletions(-)

diff --git a/Applications/LLMEval/ContentView.swift b/Applications/LLMEval/ContentView.swift
index 3741c58..d31bb12 100644
--- a/Applications/LLMEval/ContentView.swift
+++ b/Applications/LLMEval/ContentView.swift
@@ -73,6 +73,9 @@ class LLMEvaluator {
     func load() async throws -> (LLMModel, LLM.Tokenizer) {
         switch loadState {
         case .idle:
+            // limit the buffer cache
+            MLX.GPU.set(cacheLimit: 20 * 1024 * 1024)
+
             let (model, tokenizer) = try await LLM.load(configuration: modelConfiguration) {
                 [modelConfiguration] progress in
                 DispatchQueue.main.sync {
@@ -80,6 +83,8 @@ class LLMEvaluator {
                         "Downloading \(modelConfiguration.id): \(Int(progress.fractionCompleted * 100))%"
                 }
             }
+            self.output =
+                "Loaded \(modelConfiguration.id). Weights: \(MLX.GPU.activeMemory / 1024 / 1024)M"

             loadState = .loaded(model, tokenizer)
             return (model, tokenizer)
diff --git a/Applications/LLMEval/README.md b/Applications/LLMEval/README.md
index 2735b1a..cb2cf3d 100644
--- a/Applications/LLMEval/README.md
+++ b/Applications/LLMEval/README.md
@@ -16,9 +16,25 @@ Some notes about the setup:
 - this downloads models from hugging face so LLMEval -> Signing & Capabilities has the "Outgoing Connections (Client)" set in the App Sandbox
 - LLM models are large so this uses the Increased Memory Limit entitlement on iOS to allow
   ... increased memory limits for devices that have more memory
+- `MLX.GPU.set(cacheLimit: 20 * 1024 * 1024)` is used to limit the buffer cache size
 - The Phi2 4 bit model is small enough to run on some iPhone models
   - this can be changed by editing `let modelConfiguration = ModelConfiguration.phi4bit`

+### Trying Different Models
+
+The example application uses Phi2 model by default, see [ContentView.swift](ContentView.swift#L58):
+
+```
+    /// this controls which model loads -- phi4bit is one of the smaller ones so this will fit on
+    /// more devices
+    let modelConfiguration = ModelConfiguration.phi4bit
+```
+
+There are some pre-configured models in [LLM/Models.swift](../../Libraries/LLM/Models.swift#L62)
+and you can load any weights from Hugging Face where there
+is a model architecture defined and you have enough
+memory.
+
 ### Troubleshooting

 If the program crashes with a very deep stack trace you may need to build
diff --git a/Tools/llm-tool/LLMTool.swift b/Tools/llm-tool/LLMTool.swift
index 61f130e..a563a6b 100644
--- a/Tools/llm-tool/LLMTool.swift
+++ b/Tools/llm-tool/LLMTool.swift
@@ -14,12 +14,7 @@ struct LLMTool: AsyncParsableCommand {
         defaultSubcommand: SyncGenerator.self)
 }

-struct SyncGenerator: AsyncParsableCommand {
-
-    static var configuration = CommandConfiguration(
-        commandName: "sync",
-        abstract: "Synchronous generator"
-    )
+struct LLMArguments: ParsableArguments {

     @Option(name: .long, help: "Name of the huggingface model")
     var model: String = "mlx-community/Mistral-7B-v0.1-hf-4bit-mlx"
@@ -36,20 +31,91 @@ struct SyncGenerator: AsyncParsableCommand {
     @Option(name: .long, help: "The PRNG seed")
     var seed: UInt64 = 0

-    @MainActor
-    func run() async throws {
+    @Flag(help: "Show memory stats")
+    var memoryStats = false
+
+    @Option(name: .long, help: "Maximum cache size in M")
+    var cacheSize: Int?
+
+    @Option(name: .long, help: "Maximum memory size in M")
+    var memorySize: Int?
+
+    var startMemory: GPU.Snapshot?
+
+    mutating func load() async throws -> (LLMModel, Tokenizer, ModelConfiguration) {
         MLXRandom.seed(seed)

+        if let cacheSize {
+            GPU.set(cacheLimit: cacheSize * 1024 * 1024)
+        }
+
+        if let memorySize {
+            GPU.set(memoryLimit: memorySize * 1024 * 1024)
+        }
+
         let modelConfiguration = ModelConfiguration.configuration(id: model)
-        let (model, tokenizer) = try await load(configuration: modelConfiguration)
+        let (model, tokenizer) = try await LLM.load(configuration: modelConfiguration)

-        print("Model loaded -> \(self.model)")
+        startMemory = GPU.snapshot()

-        let prompt = modelConfiguration.prepare(prompt: self.prompt)
+        return (model, tokenizer, modelConfiguration)
+    }
+
+    func tokenizePropmpt(configuration: ModelConfiguration, tokenizer: Tokenizer) -> (String, [Int])
+    {
+        let prompt = configuration.prepare(prompt: self.prompt)
         let promptTokens = tokenizer.encode(text: prompt)
+        return (prompt, promptTokens)
+    }
+
+    func reportMemoryStatistics() {
+        if memoryStats, let startMemory {
+            let endMemory = GPU.snapshot()
+
+            print("=======")
+            print("Memory size: \(GPU.memoryLimit / 1024)K")
+            print("Cache size: \(GPU.cacheLimit / 1024)K")
+
+            print("")
+            print("=======")
+            print("Starting memory")
+            print(startMemory.description)
+
+            print("")
+            print("=======")
+            print("Ending memory")
+            print(endMemory.description)
+
+            print("")
+            print("=======")
+            print("Growth")
+            print(startMemory.delta(endMemory).description)
+
+        }
+    }
+}
+
+struct SyncGenerator: AsyncParsableCommand {
+
+    static var configuration = CommandConfiguration(
+        commandName: "sync",
+        abstract: "Synchronous generator"
+    )
+
+    @OptionGroup var args: LLMArguments
+
+    @MainActor
+    mutating func run() async throws {
+        let (model, tokenizer, modelConfiguration) = try await args.load()
+
+        print("Model loaded -> \(modelConfiguration.id)")
+
+        let (prompt, promptTokens) = args.tokenizePropmpt(
+            configuration: modelConfiguration, tokenizer: tokenizer)

         print("Starting generation ...")
-        print(self.prompt, terminator: "")
+        print(prompt, terminator: "")

         var start = Date.timeIntervalSinceReferenceDate
         var promptTime: TimeInterval = 0
@@ -59,7 +125,8 @@ struct SyncGenerator: AsyncParsableCommand {
         var tokens = [Int]()
         var printed = 0

-        for token in TokenIterator(prompt: MLXArray(promptTokens), model: model, temp: temperature)
+        for token in TokenIterator(
+            prompt: MLXArray(promptTokens), model: model, temp: args.temperature)
         {
             if tokens.isEmpty {
                 eval(token)
@@ -83,7 +150,7 @@ struct SyncGenerator: AsyncParsableCommand {

             printed = fullOutput.count

-            if tokens.count == maxTokens {
+            if tokens.count == args.maxTokens {
                 break
             }
         }
@@ -98,6 +165,8 @@
             Prompt Tokens per second: \((Double(promptTokens.count) / promptTime).formatted())
             Generation tokens per second: \((Double(tokens.count - 1) / generateTime).formatted())
             """)
+
+        args.reportMemoryStatistics()
     }
 }

@@ -112,35 +181,19 @@ struct AsyncGenerator: AsyncParsableCommand {
         abstract: "async generator"
     )

-    @Option(name: .long, help: "Name of the huggingface model")
-    var model: String = "mlx-community/Mistral-7B-v0.1-hf-4bit-mlx"
-
-    @Option(name: .shortAndLong, help: "The message to be processed by the model")
-    var prompt = "compare python and swift"
-
-    @Option(name: .shortAndLong, help: "Maximum number of tokens to generate")
-    var maxTokens = 100
-
-    @Option(name: .shortAndLong, help: "The sampling temperature")
-    var temperature: Float = 0.6
-
-    @Option(name: .long, help: "The PRNG seed")
-    var seed: UInt64 = 0
+    @OptionGroup var args: LLMArguments

     @MainActor
-    func run() async throws {
-        MLXRandom.seed(seed)
+    mutating func run() async throws {
+        let (model, tokenizer, modelConfiguration) = try await args.load()

-        let modelConfiguration = ModelConfiguration.configuration(id: model)
-        let (model, tokenizer) = try await load(configuration: modelConfiguration)
+        print("Model loaded -> \(modelConfiguration.id)")

-        print("Model loaded -> \(self.model)")
-
-        let prompt = modelConfiguration.prepare(prompt: self.prompt)
-        let promptTokens = tokenizer.encode(text: prompt)
+        let (prompt, promptTokens) = args.tokenizePropmpt(
+            configuration: modelConfiguration, tokenizer: tokenizer)

         print("Starting generation ...")
-        print(self.prompt, terminator: "")
+        print(prompt, terminator: "")

         var start = Date.timeIntervalSinceReferenceDate
         var promptTime: TimeInterval = 0
@@ -151,7 +204,7 @@ struct AsyncGenerator: AsyncParsableCommand {
         var printed = 0

         let (task, channel) = generate(
-            prompt: MLXArray(promptTokens), model: model, temp: temperature)
+            prompt: MLXArray(promptTokens), model: model, temp: args.temperature)

         for await token in channel {
             if tokens.isEmpty {
@@ -174,7 +227,7 @@ struct AsyncGenerator: AsyncParsableCommand {

             printed = fullOutput.count

-            if tokens.count == maxTokens {
+            if tokens.count == args.maxTokens {
                 break
             }
         }
@@ -193,6 +246,8 @@
             Generation tokens per second: \((Double(tokens.count - 1) / generateTime).formatted())
             """)

+        args.reportMemoryStatistics()
+
         // wait for the task to complete -- since it is running async, it might
         // be in the middle of running the model
         try? await Task.sleep(for: .milliseconds(500))
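
The memory-limit calls this patch relies on can also be exercised on their own, outside the CLI. Below is a minimal sketch, assuming only the API surface that appears in the diff (`GPU.set(cacheLimit:)`, `GPU.set(memoryLimit:)`, `GPU.snapshot()`, `GPU.activeMemory`, and `Snapshot.delta(_:)` from the `MLX` module); the helper name `runWithMemoryReporting` and the 20 MB / 1 GB figures are illustrative, not values taken from the patch.

```
import MLX

// Sketch: cap the buffer cache and overall memory, then report how much
// memory a unit of work consumed -- the same pattern LLMArguments.load()
// and reportMemoryStatistics() use in the diff above.
func runWithMemoryReporting(_ work: () throws -> Void) rethrows {
    // keep the buffer cache small and cap total memory use
    GPU.set(cacheLimit: 20 * 1024 * 1024)     // 20M cache, as in LLMEval
    GPU.set(memoryLimit: 1024 * 1024 * 1024)  // 1G limit (illustrative)

    let start = GPU.snapshot()
    try work()
    let end = GPU.snapshot()

    print("Active memory: \(GPU.activeMemory / 1024 / 1024)M")
    print("Growth:")
    print(start.delta(end).description)
}
```

In llm-tool the same behavior is driven by the `cacheSize`, `memorySize`, and `memoryStats` properties above (surfaced by swift-argument-parser as `--cache-size`, `--memory-size`, and `--memory-stats`); the snapshot is taken at the end of `load()`, so the reported growth reflects generation rather than weight loading.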