use memory limit API (#13)

* add buffer cache limit

* swift-format

* use a more reasonable cache size

* add memory stats to command line tool, update to final API

* add note about changing models
David Koski
2024-03-05 15:22:12 -08:00
committed by GitHub
parent 430b464c8d
commit 61105bf0c4
3 changed files with 115 additions and 39 deletions


@@ -73,6 +73,9 @@ class LLMEvaluator {
     func load() async throws -> (LLMModel, LLM.Tokenizer) {
         switch loadState {
         case .idle:
+            // limit the buffer cache
+            MLX.GPU.set(cacheLimit: 20 * 1024 * 1024)
+
             let (model, tokenizer) = try await LLM.load(configuration: modelConfiguration) {
                 [modelConfiguration] progress in
                 DispatchQueue.main.sync {
@@ -80,6 +83,8 @@ class LLMEvaluator {
                     "Downloading \(modelConfiguration.id): \(Int(progress.fractionCompleted * 100))%"
                 }
             }
+            self.output =
+                "Loaded \(modelConfiguration.id). Weights: \(MLX.GPU.activeMemory / 1024 / 1024)M"
             loadState = .loaded(model, tokenizer)
             return (model, tokenizer)
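
The commit message also mentions adding memory stats to the command line tool, which is not part of the hunks shown above. Below is a minimal sketch of what that reporting could look like. GPU.set(cacheLimit:) and GPU.activeMemory appear in the diff; GPU.cacheMemory and GPU.peakMemory are assumed here to be part of the same MLX Swift GPU API.

import MLX

// Sketch only: constrain the GPU buffer cache before loading a model,
// then print memory stats after generation finishes.
func reportMemoryStats() {
    // Same 20 MB buffer cache limit as in the diff above.
    MLX.GPU.set(cacheLimit: 20 * 1024 * 1024)

    // ... load the model and run generation here ...

    let mb = 1024 * 1024
    print("Active memory: \(MLX.GPU.activeMemory / mb)M")
    print("Cache memory:  \(MLX.GPU.cacheMemory / mb)M")   // assumed property
    print("Peak memory:   \(MLX.GPU.peakMemory / mb)M")    // assumed property
}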