From 0199407d93bcb5f6e27732772592c544e3c0e586 Mon Sep 17 00:00:00 2001
From: David Koski <46639364+davidkoski@users.noreply.github.com>
Date: Thu, 28 Mar 2024 12:00:52 -0700
Subject: [PATCH] LLMEval performance (#40)

* notes about performance and some performance improvements (don't update the display for every token)

* swift-format

* Update Applications/LLMEval/README.md

Co-authored-by: Awni Hannun

* Update Applications/LLMEval/README.md

Co-authored-by: Awni Hannun

---------

Co-authored-by: Awni Hannun
---
 Applications/LLMEval/ContentView.swift | 25 ++++++++++++++++++++++---
 Applications/LLMEval/README.md         |  6 ++++++
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/Applications/LLMEval/ContentView.swift b/Applications/LLMEval/ContentView.swift
index 6fd9ec5..103c78e 100644
--- a/Applications/LLMEval/ContentView.swift
+++ b/Applications/LLMEval/ContentView.swift
@@ -152,6 +152,11 @@ class LLMEvaluator {
     let temperature: Float = 0.6
     let maxTokens = 240
 
+    /// update the display every N tokens -- 4 looks like it updates continuously
+    /// and is low overhead. observed ~15% reduction in tokens/s when updating
+    /// on every token
+    let displayEveryNTokens = 4
+
     enum LoadState {
         case idle
         case loaded(LLMModel, Tokenizers.Tokenizer)
@@ -198,7 +203,7 @@ class LLMEvaluator {
         let prompt = modelConfiguration.prepare(prompt: prompt)
         let promptTokens = MLXArray(tokenizer.encode(text: prompt))
 
-        let initTime = Date()
+        var initTime = Date()
         let initDuration = initTime.timeIntervalSince(startTime)
         await MainActor.run {
             self.stat = "Init: \(String(format: "%.3f", initDuration))s"
@@ -212,6 +217,12 @@ class LLMEvaluator {
         for token in TokenIterator(prompt: promptTokens, model: model, temp: temperature) {
             let tokenId = token.item(Int.self)
 
+            // to match the measurement from the command line we reset the start time
+            // after the first token is generated (called the prompt time)
+            if outputTokens.isEmpty {
+                initTime = Date()
+            }
+
             if tokenId == tokenizer.unknownTokenId || tokenId == tokenizer.eosTokenId {
                 break
             }
@@ -220,8 +231,10 @@ class LLMEvaluator {
             let text = tokenizer.decode(tokens: outputTokens)
 
             // update the output -- this will make the view show the text as it generates
-            await MainActor.run {
-                self.output = text
+            if outputTokens.count % displayEveryNTokens == 0 {
+                await MainActor.run {
+                    self.output = text
+                }
             }
 
             if outputTokens.count == maxTokens {
@@ -232,7 +245,13 @@ class LLMEvaluator {
         let tokenDuration = Date().timeIntervalSince(initTime)
         let tokensPerSecond = Double(outputTokens.count) / tokenDuration
 
+        // update the text if needed, e.g. we haven't displayed because of displayEveryNTokens
+        let finalText = tokenizer.decode(tokens: outputTokens)
+
         await MainActor.run {
+            if finalText != self.output {
+                self.output = finalText
+            }
             running = false
             self.stat += " Tokens/second: \(String(format: "%.3f", tokensPerSecond))"
         }
diff --git a/Applications/LLMEval/README.md b/Applications/LLMEval/README.md
index cb2cf3d..45a2208 100644
--- a/Applications/LLMEval/README.md
+++ b/Applications/LLMEval/README.md
@@ -50,3 +50,9 @@
 Building in Release / optimizations will remove a lot of tail calls in the C++
 layer. These lead to the stack overflows. See discussion here:
 https://github.com/ml-explore/mlx-swift-examples/issues/3
+
+### Performance
+
+Different models have different performance characteristics. For example, Gemma 2B may outperform Phi-2 in terms of tokens / second.
+
+You may also find that running outside the debugger boosts performance. You can do this in Xcode by pressing cmd-opt-r and unchecking "Debug Executable".