From 0199407d93bcb5f6e27732772592c544e3c0e586 Mon Sep 17 00:00:00 2001
From: David Koski <46639364+davidkoski@users.noreply.github.com>
Date: Thu, 28 Mar 2024 12:00:52 -0700
Subject: [PATCH] LLMEval performance (#40)

* notes about performance and some performance improvements (don't update the display for every token)

* swift-format

* Update Applications/LLMEval/README.md

Co-authored-by: Awni Hannun

* Update Applications/LLMEval/README.md

Co-authored-by: Awni Hannun

---------

Co-authored-by: Awni Hannun
---
 Applications/LLMEval/ContentView.swift | 25 ++++++++++++++++++++++---
 Applications/LLMEval/README.md         |  6 ++++++
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/Applications/LLMEval/ContentView.swift b/Applications/LLMEval/ContentView.swift
index 6fd9ec5..103c78e 100644
--- a/Applications/LLMEval/ContentView.swift
+++ b/Applications/LLMEval/ContentView.swift
@@ -152,6 +152,11 @@ class LLMEvaluator {
     let temperature: Float = 0.6
     let maxTokens = 240
 
+    /// update the display every N tokens -- 4 looks like it updates continuously
+    /// and is low overhead. observed ~15% reduction in tokens/s when updating
+    /// on every token
+    let displayEveryNTokens = 4
+
     enum LoadState {
         case idle
         case loaded(LLMModel, Tokenizers.Tokenizer)
@@ -198,7 +203,7 @@ class LLMEvaluator {
         let prompt = modelConfiguration.prepare(prompt: prompt)
         let promptTokens = MLXArray(tokenizer.encode(text: prompt))
 
-        let initTime = Date()
+        var initTime = Date()
         let initDuration = initTime.timeIntervalSince(startTime)
         await MainActor.run {
             self.stat = "Init: \(String(format: "%.3f", initDuration))s"
@@ -212,6 +217,12 @@ class LLMEvaluator {
         for token in TokenIterator(prompt: promptTokens, model: model, temp: temperature) {
             let tokenId = token.item(Int.self)
 
+            // to match the measurement from the command line we reset the start time
+            // after the first token is generated (called the prompt time)
+            if outputTokens.isEmpty {
+                initTime = Date()
+            }
+
             if tokenId == tokenizer.unknownTokenId || tokenId == tokenizer.eosTokenId {
                 break
             }
@@ -220,8 +231,10 @@ class LLMEvaluator {
             let text = tokenizer.decode(tokens: outputTokens)
 
             // update the output -- this will make the view show the text as it generates
-            await MainActor.run {
-                self.output = text
+            if outputTokens.count % displayEveryNTokens == 0 {
+                await MainActor.run {
+                    self.output = text
+                }
             }
 
             if outputTokens.count == maxTokens {
@@ -232,7 +245,13 @@ class LLMEvaluator {
         let tokenDuration = Date().timeIntervalSince(initTime)
         let tokensPerSecond = Double(outputTokens.count) / tokenDuration
 
+        // update the text if needed, e.g. we haven't displayed because of displayEveryNTokens
+        let finalText = tokenizer.decode(tokens: outputTokens)
+
         await MainActor.run {
+            if finalText != self.output {
+                self.output = finalText
+            }
             running = false
             self.stat += " Tokens/second: \(String(format: "%.3f", tokensPerSecond))"
         }
diff --git a/Applications/LLMEval/README.md b/Applications/LLMEval/README.md
index cb2cf3d..45a2208 100644
--- a/Applications/LLMEval/README.md
+++ b/Applications/LLMEval/README.md
@@ -50,3 +50,9 @@
 Building in Release / optimizations will remove a lot of tail calls in the C++
 layer. These lead to the stack overflows. See discussion here:
 https://github.com/ml-explore/mlx-swift-examples/issues/3
+
+### Performance
+
+Different models have different performance characteristics. For example, Gemma 2B may outperform Phi-2 in terms of tokens / second.
+
+You may also find that running outside the debugger boosts performance. You can do this in Xcode by pressing cmd-opt-r and unchecking "Debug Executable".