handle partially quantized models (#76)

* handle partially quantized models (see the sketch after the notes below)

- fix for #53, #71, #69, #74
- in order to test the models:
	- added a default prompt of an appropriate form
	- while working on the model configuration, also added additional stop tokens (#74)
- fixed the repetitionPenalty code (#71); a sketch of the penalty logic also follows below
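
For context, the partial-quantization handling comes down to only treating a layer as quantized when the checkpoint actually contains quantization parameters for it. This is a minimal sketch of that check, not the code from this commit: the `isQuantizedLayer` helper is hypothetical, and it assumes the MLX convention of storing `scales`/`biases` next to the quantized `weight` for each quantized layer.

```swift
// Sketch only: decide per layer whether to use a quantized implementation.
// A layer is treated as quantized when the checkpoint contains "<path>.scales".

/// Hypothetical helper: true if the loaded weights contain quantization
/// parameters for the layer at `path`.
func isQuantizedLayer(path: String, weightKeys: Set<String>) -> Bool {
    weightKeys.contains("\(path).scales")
}

// Example with a partially quantized checkpoint: lm_head was left
// unquantized, so it keeps the regular Linear implementation.
let weightKeys: Set<String> = [
    "model.layers.0.self_attn.q_proj.weight",
    "model.layers.0.self_attn.q_proj.scales",
    "model.layers.0.self_attn.q_proj.biases",
    "lm_head.weight",
]

print(isQuantizedLayer(path: "model.layers.0.self_attn.q_proj", weightKeys: weightKeys))  // true
print(isQuantizedLayer(path: "lm_head", weightKeys: weightKeys))  // false
```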
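
The repetitionPenalty fix (#71) concerns how the penalty is applied to the logits of tokens that were already generated. The following is not the repository's code, just a sketch of the common CTRL-style penalty written against plain Swift arrays; the function name, token IDs, and logit values are made up for illustration.

```swift
// Sketch only: CTRL-style repetition penalty applied to raw logits.
// For each token already present in the generated context, a positive logit
// is divided by the penalty and a negative logit is multiplied by it, so
// repeated tokens always become less likely (for penalty > 1).
func applyRepetitionPenalty(
    logits: [Float], repetitionContext: [Int], penalty: Float
) -> [Float] {
    var logits = logits
    for token in Set(repetitionContext) {
        let score = logits[token]
        logits[token] = score < 0 ? score * penalty : score / penalty
    }
    return logits
}

// Hypothetical example: token 2 was generated before, so its logit shrinks.
let adjusted = applyRepetitionPenalty(
    logits: [1.0, -0.5, 3.0, 0.2], repetitionContext: [2], penalty: 1.3)
print(adjusted)  // approximately [1.0, -0.5, 2.31, 0.2]
```
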
Author: David Koski
Date: 2024-05-28 16:35:11 -07:00
Committed by: GitHub
Parent: 65f4968e5f
Commit: 9d74afd119
12 changed files with 139 additions and 67 deletions

@@ -194,6 +194,13 @@ public class LlamaModel: Module, LLMModel {
        let (out, cache) = model(inputs, cache: cache)
        return (lmHead(out), cache)
    }

    public func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] {
        // Remove unused precomputed rotary freqs: some checkpoints ship
        // precomputed inv_freq buffers that this implementation computes itself.
        weights.filter {
            !$0.key.contains("self_attn.rotary_emb.inv_freq")
        }
    }
}

public struct LlamaConfiguration: Codable {