From 61105bf0c41b3edda68a9087f0e0c817b1dd580e Mon Sep 17 00:00:00 2001
From: David Koski <46639364+davidkoski@users.noreply.github.com>
Date: Tue, 5 Mar 2024 15:22:12 -0800
Subject: [PATCH] use memory limit API (#13)

* add buffer cache limit

* swift-format

* a more reasonable size

* add memory stats to command line tool, update to final api

* add note about changing models
---
 Applications/LLMEval/ContentView.swift |   5 +
 Applications/LLMEval/README.md         |  16 +++
 Tools/llm-tool/LLMTool.swift           | 133 +++++++++++++++++--------
 3 files changed, 115 insertions(+), 39 deletions(-)

diff --git a/Applications/LLMEval/ContentView.swift b/Applications/LLMEval/ContentView.swift
index 3741c58..d31bb12 100644
--- a/Applications/LLMEval/ContentView.swift
+++ b/Applications/LLMEval/ContentView.swift
@@ -73,6 +73,9 @@ class LLMEvaluator {
     func load() async throws -> (LLMModel, LLM.Tokenizer) {
         switch loadState {
         case .idle:
+            // limit the buffer cache
+            MLX.GPU.set(cacheLimit: 20 * 1024 * 1024)
+
             let (model, tokenizer) = try await LLM.load(configuration: modelConfiguration) {
                 [modelConfiguration] progress in
                 DispatchQueue.main.sync {
@@ -80,6 +83,8 @@ class LLMEvaluator {
                         "Downloading \(modelConfiguration.id): \(Int(progress.fractionCompleted * 100))%"
                 }
             }
+            self.output =
+                "Loaded \(modelConfiguration.id). Weights: \(MLX.GPU.activeMemory / 1024 / 1024)M"

             loadState = .loaded(model, tokenizer)
             return (model, tokenizer)
diff --git a/Applications/LLMEval/README.md b/Applications/LLMEval/README.md
index 2735b1a..cb2cf3d 100644
--- a/Applications/LLMEval/README.md
+++ b/Applications/LLMEval/README.md
@@ -16,9 +16,25 @@ Some notes about the setup:
 - this downloads models from hugging face so LLMEval -> Signing & Capabilities has the "Outgoing Connections (Client)" set in the App Sandbox
 - LLM models are large so this uses the Increased Memory Limit entitlement on iOS to allow
   ... increased memory limits for devices that have more memory
+- `MLX.GPU.set(cacheLimit: 20 * 1024 * 1024)` is used to limit the buffer cache size
 - The Phi2 4 bit model is small enough to run on some iPhone models
   - this can be changed by editing `let modelConfiguration = ModelConfiguration.phi4bit`

+### Trying Different Models
+
+The example application uses Phi2 model by default, see [ContentView.swift](ContentView.swift#L58):
+
+```
+    /// this controls which model loads -- phi4bit is one of the smaller ones so this will fit on
+    /// more devices
+    let modelConfiguration = ModelConfiguration.phi4bit
+```
+
+There are some pre-configured models in [LLM/Models.swift](../../Libraries/LLM/Models.swift#L62)
+and you can load any weights from Hugging Face where there
+is a model architecture defined and you have enough
+memory.
+
 ### Troubleshooting

 If the program crashes with a very deep stack trace you may need to build
diff --git a/Tools/llm-tool/LLMTool.swift b/Tools/llm-tool/LLMTool.swift
index 61f130e..a563a6b 100644
--- a/Tools/llm-tool/LLMTool.swift
+++ b/Tools/llm-tool/LLMTool.swift
@@ -14,12 +14,7 @@ struct LLMTool: AsyncParsableCommand {
         defaultSubcommand: SyncGenerator.self)
 }

-struct SyncGenerator: AsyncParsableCommand {
-
-    static var configuration = CommandConfiguration(
-        commandName: "sync",
-        abstract: "Synchronous generator"
-    )
+struct LLMArguments: ParsableArguments {

     @Option(name: .long, help: "Name of the huggingface model")
     var model: String = "mlx-community/Mistral-7B-v0.1-hf-4bit-mlx"
@@ -36,20 +31,91 @@ struct SyncGenerator: AsyncParsableCommand {
     @Option(name: .long, help: "The PRNG seed")
     var seed: UInt64 = 0

-    @MainActor
-    func run() async throws {
+    @Flag(help: "Show memory stats")
+    var memoryStats = false
+
+    @Option(name: .long, help: "Maximum cache size in M")
+    var cacheSize: Int?
+
+    @Option(name: .long, help: "Maximum memory size in M")
+    var memorySize: Int?
+
+    var startMemory: GPU.Snapshot?
+
+    mutating func load() async throws -> (LLMModel, Tokenizer, ModelConfiguration) {
         MLXRandom.seed(seed)

+        if let cacheSize {
+            GPU.set(cacheLimit: cacheSize * 1024 * 1024)
+        }
+
+        if let memorySize {
+            GPU.set(memoryLimit: memorySize * 1024 * 1024)
+        }
+
         let modelConfiguration = ModelConfiguration.configuration(id: model)
-        let (model, tokenizer) = try await load(configuration: modelConfiguration)
+        let (model, tokenizer) = try await LLM.load(configuration: modelConfiguration)

-        print("Model loaded -> \(self.model)")
+        startMemory = GPU.snapshot()

-        let prompt = modelConfiguration.prepare(prompt: self.prompt)
+        return (model, tokenizer, modelConfiguration)
+    }
+
+    func tokenizePropmpt(configuration: ModelConfiguration, tokenizer: Tokenizer) -> (String, [Int])
+    {
+        let prompt = configuration.prepare(prompt: self.prompt)
         let promptTokens = tokenizer.encode(text: prompt)
+        return (prompt, promptTokens)
+    }
+
+    func reportMemoryStatistics() {
+        if memoryStats, let startMemory {
+            let endMemory = GPU.snapshot()
+
+            print("=======")
+            print("Memory size: \(GPU.memoryLimit / 1024)K")
+            print("Cache size: \(GPU.cacheLimit / 1024)K")
+
+            print("")
+            print("=======")
+            print("Starting memory")
+            print(startMemory.description)
+
+            print("")
+            print("=======")
+            print("Ending memory")
+            print(endMemory.description)
+
+            print("")
+            print("=======")
+            print("Growth")
+            print(startMemory.delta(endMemory).description)
+
+        }
+    }
+}
+
+struct SyncGenerator: AsyncParsableCommand {
+
+    static var configuration = CommandConfiguration(
+        commandName: "sync",
+        abstract: "Synchronous generator"
+    )
+
+    @OptionGroup var args: LLMArguments
+
+    @MainActor
+    mutating func run() async throws {
+        let (model, tokenizer, modelConfiguration) = try await args.load()
+
+        print("Model loaded -> \(modelConfiguration.id)")
+
+        let (prompt, promptTokens) = args.tokenizePropmpt(
+            configuration: modelConfiguration, tokenizer: tokenizer)

         print("Starting generation ...")
-        print(self.prompt, terminator: "")
+        print(prompt, terminator: "")

         var start = Date.timeIntervalSinceReferenceDate
         var promptTime: TimeInterval = 0
@@ -59,7 +125,8 @@ struct SyncGenerator: AsyncParsableCommand {
         var tokens = [Int]()
         var printed = 0

-        for token in TokenIterator(prompt: MLXArray(promptTokens), model: model, temp: temperature)
+        for token in TokenIterator(
+            prompt: MLXArray(promptTokens), model: model, temp: args.temperature)
         {
             if tokens.isEmpty {
                 eval(token)
@@ -83,7 +150,7 @@ struct SyncGenerator: AsyncParsableCommand {

             printed = fullOutput.count

-            if tokens.count == maxTokens {
+            if tokens.count == args.maxTokens {
                 break
             }
         }
@@ -98,6 +165,8 @@
             Prompt Tokens per second: \((Double(promptTokens.count) / promptTime).formatted())
             Generation tokens per second: \((Double(tokens.count - 1) / generateTime).formatted())
             """)
+
+        args.reportMemoryStatistics()
     }
 }

@@ -112,35 +181,19 @@ struct AsyncGenerator: AsyncParsableCommand {
         abstract: "async generator"
     )

-    @Option(name: .long, help: "Name of the huggingface model")
-    var model: String = "mlx-community/Mistral-7B-v0.1-hf-4bit-mlx"
-
-    @Option(name: .shortAndLong, help: "The message to be processed by the model")
-    var prompt = "compare python and swift"
-
-    @Option(name: .shortAndLong, help: "Maximum number of tokens to generate")
-    var maxTokens = 100
-
-    @Option(name: .shortAndLong, help: "The sampling temperature")
-    var temperature: Float = 0.6
-
-    @Option(name: .long, help: "The PRNG seed")
-    var seed: UInt64 = 0
+    @OptionGroup var args: LLMArguments

     @MainActor
-    func run() async throws {
-        MLXRandom.seed(seed)
+    mutating func run() async throws {
+        let (model, tokenizer, modelConfiguration) = try await args.load()

-        let modelConfiguration = ModelConfiguration.configuration(id: model)
-        let (model, tokenizer) = try await load(configuration: modelConfiguration)
+        print("Model loaded -> \(modelConfiguration.id)")

-        print("Model loaded -> \(self.model)")
-
-        let prompt = modelConfiguration.prepare(prompt: self.prompt)
-        let promptTokens = tokenizer.encode(text: prompt)
+        let (prompt, promptTokens) = args.tokenizePropmpt(
+            configuration: modelConfiguration, tokenizer: tokenizer)

         print("Starting generation ...")
-        print(self.prompt, terminator: "")
+        print(prompt, terminator: "")

         var start = Date.timeIntervalSinceReferenceDate
         var promptTime: TimeInterval = 0
@@ -151,7 +204,7 @@ struct AsyncGenerator: AsyncParsableCommand {
         var printed = 0

         let (task, channel) = generate(
-            prompt: MLXArray(promptTokens), model: model, temp: temperature)
+            prompt: MLXArray(promptTokens), model: model, temp: args.temperature)

         for await token in channel {
             if tokens.isEmpty {
@@ -174,7 +227,7 @@ struct AsyncGenerator: AsyncParsableCommand {

             printed = fullOutput.count

-            if tokens.count == maxTokens {
+            if tokens.count == args.maxTokens {
                 break
             }
         }
@@ -193,6 +246,8 @@
             Generation tokens per second: \((Double(tokens.count - 1) / generateTime).formatted())
             """)

+        args.reportMemoryStatistics()
+
         // wait for the task to complete -- since it is running async, it might
         // be in the middle of running the model
         try? await Task.sleep(for: .milliseconds(500))
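
The memory-limit calls this patch relies on can also be exercised on their own, outside the CLI. Below is a minimal sketch, assuming only the API surface that appears in the diff (`GPU.set(cacheLimit:)`, `GPU.set(memoryLimit:)`, `GPU.snapshot()`, `GPU.activeMemory`, and `Snapshot.delta(_:)` from the `MLX` module); the helper name `runWithMemoryReporting` and the 20 MB / 1 GB figures are illustrative, not values taken from the patch.

```
import MLX

// Sketch: cap the buffer cache and overall memory, then report how much
// memory a unit of work consumed -- the same pattern LLMArguments.load()
// and reportMemoryStatistics() use in the diff above.
func runWithMemoryReporting(_ work: () throws -> Void) rethrows {
    // keep the buffer cache small and cap total memory use
    GPU.set(cacheLimit: 20 * 1024 * 1024)     // 20M cache, as in LLMEval
    GPU.set(memoryLimit: 1024 * 1024 * 1024)  // 1G limit (illustrative)

    let start = GPU.snapshot()
    try work()
    let end = GPU.snapshot()

    print("Active memory: \(GPU.activeMemory / 1024 / 1024)M")
    print("Growth:")
    print(start.delta(end).description)
}
```

In llm-tool the same behavior is driven by the `cacheSize`, `memorySize`, and `memoryStats` properties above (surfaced by swift-argument-parser as `--cache-size`, `--memory-size`, and `--memory-stats`); the snapshot is taken at the end of `load()`, so the reported growth reflects generation rather than weight loading.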