switch swift-tokenizers to main, remove some workarounds (#26)
* switch swift-tokenizers to main, remove some workarounds - swift-tokenizers is getting a lot of updates and fixes, let's track main for now - remove some workarounds that are no longer needed - https://github.com/huggingface/swift-transformers/issues/63
This commit is contained in:
@@ -67,54 +67,13 @@ public func loadTokenizer(configuration: ModelConfiguration) async throws -> Tok
|
|||||||
tokenizerConfig = Config(dictionary)
|
tokenizerConfig = Config(dictionary)
|
||||||
}
|
}
|
||||||
|
|
||||||
// workaround: some merges can't be split on space in BPETokenizer
|
|
||||||
if let tokenizerClass = tokenizerConfig.tokenizerClass?.stringValue {
|
|
||||||
switch tokenizerClass {
|
|
||||||
case "T5Tokenizer":
|
|
||||||
break
|
|
||||||
default:
|
|
||||||
tokenizerData = discardUnhandledMerges(tokenizerData: tokenizerData)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let impl = try PreTrainedTokenizer(
|
let impl = try PreTrainedTokenizer(
|
||||||
tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData)
|
tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData)
|
||||||
|
|
||||||
return Tokenizer(tokenizer: impl, tokenizerConfig: tokenizerConfig)
|
return Tokenizer(tokenizer: impl, tokenizerConfig: tokenizerConfig)
|
||||||
}
|
}
|
||||||
|
|
||||||
public func discardUnhandledMerges(tokenizerData: Config) -> Config {
|
|
||||||
// see https://github.com/ml-explore/mlx-swift-examples/issues/1
|
|
||||||
// and https://github.com/huggingface/swift-transformers/issues/51
|
|
||||||
|
|
||||||
if let model = tokenizerData.model {
|
|
||||||
if let merges = model.dictionary["merges"] as? [String] {
|
|
||||||
// discard any merges that can't be split on a space
|
|
||||||
// (required by BPETokenizer)
|
|
||||||
let newMerges =
|
|
||||||
merges
|
|
||||||
.filter {
|
|
||||||
$0.split(separator: " ").count == 2
|
|
||||||
}
|
|
||||||
|
|
||||||
if newMerges.count != merges.count {
|
|
||||||
var newModel = model.dictionary
|
|
||||||
newModel["merges"] = newMerges
|
|
||||||
|
|
||||||
var newTokenizerData = tokenizerData.dictionary
|
|
||||||
newTokenizerData["model"] = newModel
|
|
||||||
|
|
||||||
return Config(newTokenizerData)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return tokenizerData
|
|
||||||
}
|
|
||||||
|
|
||||||
/// overrides for TokenizerModel/knownTokenizers
|
/// overrides for TokenizerModel/knownTokenizers
|
||||||
let replacementTokenizers = [
|
let replacementTokenizers = [
|
||||||
"CodeLlamaTokenizer": "LlamaTokenizer",
|
"Qwen2Tokenizer": "PreTrainedTokenizer"
|
||||||
"GemmaTokenizer": "PreTrainedTokenizer",
|
|
||||||
"Qwen2Tokenizer": "PreTrainedTokenizer",
|
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -2220,8 +2220,8 @@
|
|||||||
isa = XCRemoteSwiftPackageReference;
|
isa = XCRemoteSwiftPackageReference;
|
||||||
repositoryURL = "https://github.com/huggingface/swift-transformers";
|
repositoryURL = "https://github.com/huggingface/swift-transformers";
|
||||||
requirement = {
|
requirement = {
|
||||||
kind = upToNextMajorVersion;
|
branch = main;
|
||||||
minimumVersion = 0.1.2;
|
kind = branch;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
C392736E2B60699100368D5D /* XCRemoteSwiftPackageReference "swift-argument-parser" */ = {
|
C392736E2B60699100368D5D /* XCRemoteSwiftPackageReference "swift-argument-parser" */ = {
|
||||||
|
|||||||
@@ -59,8 +59,8 @@
|
|||||||
"kind" : "remoteSourceControl",
|
"kind" : "remoteSourceControl",
|
||||||
"location" : "https://github.com/huggingface/swift-transformers",
|
"location" : "https://github.com/huggingface/swift-transformers",
|
||||||
"state" : {
|
"state" : {
|
||||||
"revision" : "564442fba36b0b694d730a62d0593e5f54043b55",
|
"branch" : "main",
|
||||||
"version" : "0.1.2"
|
"revision" : "24605a8c0cc974bec5b94a6752eb687bae77db31"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
Reference in New Issue
Block a user