handle partially quantized models (#76)
* handle partially quantized models - fix for #53, #71, #69, #74 (see the sketch below)
* to test the models, I added a default prompt of an appropriate form
* while working on the model configuration, I also added additional stop tokens (#74)
* fixed the repetitionPenalty code (#71)
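For context on the partial-quantization handling: a quantized checkpoint can leave some layers in full precision (for example when a layer's dimensions do not divide evenly by the quantization group size), so the loader should only swap in quantized layers for which the checkpoint actually ships quantization scales. The sketch below illustrates that check only; `quantizeModel` and its closure are hypothetical placeholders, not the API used by this commit.

import MLX
import MLXNN

// A layer counts as quantized only if the checkpoint contains quantization
// scales for it; otherwise it stays a regular full-precision Linear.
func isQuantizedLayer(path: String, weights: [String: MLXArray]) -> Bool {
    weights["\(path).scales"] != nil
}

// Hypothetical call site: `quantizeModel` stands in for whatever helper
// replaces Linear modules with quantized ones. Only layers passing the
// check above would be converted.
// quantizeModel(model, groupSize: 64, bits: 4) { path, _ in
//     isQuantizedLayer(path: path, weights: weights)
// }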
@@ -194,6 +194,13 @@ public class LlamaModel: Module, LLMModel {
         let (out, cache) = model(inputs, cache: cache)
         return (lmHead(out), cache)
     }
+
+    public func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] {
+        // Remove unused precomputed rotary freqs
+        weights.filter {
+            !$0.key.contains("self_attn.rotary_emb.inv_freq")
+        }
+    }
 }
 
 public struct LlamaConfiguration: Codable {
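For reference, a minimal sketch of where sanitize(weights:) would run during weight loading, assuming MLX Swift's loadArrays(url:) for reading a safetensors file and Module.update(parameters:verify:) for installing parameters; the load function itself is illustrative, not the exact loader in this repo.

import Foundation
import MLX
import MLXNN

// Illustrative load path: read the checkpoint, strip the unused
// rotary_emb.inv_freq entries via sanitize, then install the parameters.
func loadWeights(into model: LlamaModel, from url: URL) throws {
    var weights = try loadArrays(url: url)
    weights = model.sanitize(weights: weights)
    let parameters = ModuleParameters.unflattened(weights)
    try model.update(parameters: parameters, verify: [.all])
}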