Remove Tokenizer wrapper (#28)
* Remove Tokenizer wrapper (depends on https://github.com/huggingface/swift-transformers/pull/70)
* Update swift-transformers revision
This commit is contained in:
@@ -4,51 +4,6 @@ import Foundation
|
|||||||
import Hub
|
import Hub
|
||||||
import Tokenizers
|
import Tokenizers
|
||||||
|
|
||||||
/// Wrapper for `Tokenizers.Tokenizer` that provides access to config
|
|
||||||
/// like ``eosToken``.
|
|
||||||
public struct Tokenizer: Tokenizers.Tokenizer {
|
|
||||||
|
|
||||||
let tokenizer: Tokenizers.Tokenizer
|
|
||||||
|
|
||||||
public let eosToken: String?
|
|
||||||
public let eosTokenId: Int?
|
|
||||||
|
|
||||||
internal init(tokenizer: Tokenizers.Tokenizer, tokenizerConfig: Config) {
|
|
||||||
self.tokenizer = tokenizer
|
|
||||||
self.eosToken = tokenizerConfig.eosToken?.stringValue
|
|
||||||
if let eosToken {
|
|
||||||
self.eosTokenId = tokenizer.convertTokenToId(eosToken)
|
|
||||||
} else {
|
|
||||||
self.eosTokenId = nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public func tokenize(text: String) -> [String] {
|
|
||||||
tokenizer.tokenize(text: text)
|
|
||||||
}
|
|
||||||
|
|
||||||
public func encode(text: String) -> [Int] {
|
|
||||||
tokenizer.encode(text: text)
|
|
||||||
}
|
|
||||||
|
|
||||||
public func decode(tokens: [Int]) -> String {
|
|
||||||
tokenizer.decode(tokens: tokens)
|
|
||||||
}
|
|
||||||
|
|
||||||
public func convertTokenToId(_ token: String) -> Int? {
|
|
||||||
tokenizer.convertTokenToId(token)
|
|
||||||
}
|
|
||||||
|
|
||||||
public func convertIdToToken(_ id: Int) -> String? {
|
|
||||||
tokenizer.convertIdToToken(id)
|
|
||||||
}
|
|
||||||
|
|
||||||
public var unknownToken: String? { tokenizer.unknownToken }
|
|
||||||
|
|
||||||
public var unknownTokenId: Int? { tokenizer.unknownTokenId }
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public func loadTokenizer(configuration: ModelConfiguration) async throws -> Tokenizer {
|
public func loadTokenizer(configuration: ModelConfiguration) async throws -> Tokenizer {
|
||||||
// from AutoTokenizer.from() -- this lets us override parts of the configuration
|
// from AutoTokenizer.from() -- this lets us override parts of the configuration
|
||||||
let config = LanguageModelConfigurationFromHub(
|
let config = LanguageModelConfigurationFromHub(
|
||||||
@@ -56,7 +11,7 @@ public func loadTokenizer(configuration: ModelConfiguration) async throws -> Tok
|
|||||||
guard var tokenizerConfig = try await config.tokenizerConfig else {
|
guard var tokenizerConfig = try await config.tokenizerConfig else {
|
||||||
throw LLMError(message: "missing config")
|
throw LLMError(message: "missing config")
|
||||||
}
|
}
|
||||||
var tokenizerData = try await config.tokenizerData
|
let tokenizerData = try await config.tokenizerData
|
||||||
|
|
||||||
// workaround: replacement tokenizers for unhandled values in swift-transform
|
// workaround: replacement tokenizers for unhandled values in swift-transform
|
||||||
if let tokenizerClass = tokenizerConfig.tokenizerClass?.stringValue,
|
if let tokenizerClass = tokenizerConfig.tokenizerClass?.stringValue,
|
||||||
@@ -67,10 +22,8 @@ public func loadTokenizer(configuration: ModelConfiguration) async throws -> Tok
|
|||||||
tokenizerConfig = Config(dictionary)
|
tokenizerConfig = Config(dictionary)
|
||||||
}
|
}
|
||||||
|
|
||||||
let impl = try PreTrainedTokenizer(
|
return try PreTrainedTokenizer(
|
||||||
tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData)
|
tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData)
|
||||||
|
|
||||||
return Tokenizer(tokenizer: impl, tokenizerConfig: tokenizerConfig)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// overrides for TokenizerModel/knownTokenizers
|
/// overrides for TokenizerModel/knownTokenizers
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import Foundation
|
|||||||
import LLM
|
import LLM
|
||||||
import MLX
|
import MLX
|
||||||
import MLXRandom
|
import MLXRandom
|
||||||
|
import Tokenizers
|
||||||
|
|
||||||
@main
|
@main
|
||||||
struct LLMTool: AsyncParsableCommand {
|
struct LLMTool: AsyncParsableCommand {
|
||||||
|
|||||||
@@ -60,7 +60,7 @@
|
|||||||
"location" : "https://github.com/huggingface/swift-transformers",
|
"location" : "https://github.com/huggingface/swift-transformers",
|
||||||
"state" : {
|
"state" : {
|
||||||
"branch" : "main",
|
"branch" : "main",
|
||||||
"revision" : "24605a8c0cc974bec5b94a6752eb687bae77db31"
|
"revision" : "3bd02269b7797ade67c15679a575cd5c6f203ce6"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
Reference in New Issue
Block a user