Skip to content

Commit

Permalink
tts : outetts-voc -> wavtokenizer-dec
Browse files · Browse the repository at this point in the history
  • Loading branch information
ggerganov committed Dec 16, 2024
1 parent f1b5b6b commit 985d59f
Show file tree
Hide file tree
Showing 5 changed files with 198 additions and 198 deletions.
6 changes: 3 additions & 3 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2032,9 +2032,9 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
yield name, data


@Model.register("OuteTTSVocoder")
class OuteTTSVocoderModel(Model):
model_arch = gguf.MODEL_ARCH.OUTETTS_VOC
@Model.register("WavTokenizerDec")
class WavTokenizerDecModel(Model):
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
Expand Down
4 changes: 2 additions & 2 deletions examples/tts/convert_pt_to_hf.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# convert the https://huggingface.co/novateur/WavTokenizer-large-speech-75token to HF format
# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the OuteTTSS vocoder
# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the WavTokenizer decoder
#
# TODO: this script is LLM-generated and probably very inefficient and should be rewritten

Expand Down Expand Up @@ -144,7 +144,7 @@ def flatten_state_dict(state_dict, parent_key='', sep='.'):

config = {
"architectures": [
"OuteTTSVocoder"
"WavTokenizerDec"
],
"hidden_size": 1282,
"vocab_size": 4096,
Expand Down
214 changes: 107 additions & 107 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,59 +209,59 @@ class GGUFType:


class MODEL_ARCH(IntEnum):
LLAMA = auto()
FALCON = auto()
BAICHUAN = auto()
GROK = auto()
GPT2 = auto()
GPTJ = auto()
GPTNEOX = auto()
MPT = auto()
STARCODER = auto()
REFACT = auto()
BERT = auto()
NOMIC_BERT = auto()
JINA_BERT_V2 = auto()
BLOOM = auto()
STABLELM = auto()
QWEN = auto()
QWEN2 = auto()
QWEN2MOE = auto()
QWEN2VL = auto()
PHI2 = auto()
PHI3 = auto()
PLAMO = auto()
CODESHELL = auto()
ORION = auto()
INTERNLM2 = auto()
MINICPM = auto()
MINICPM3 = auto()
GEMMA = auto()
GEMMA2 = auto()
STARCODER2 = auto()
RWKV6 = auto()
MAMBA = auto()
XVERSE = auto()
COMMAND_R = auto()
DBRX = auto()
OLMO = auto()
OLMO2 = auto()
OLMOE = auto()
OPENELM = auto()
ARCTIC = auto()
DEEPSEEK = auto()
DEEPSEEK2 = auto()
CHATGLM = auto()
BITNET = auto()
T5 = auto()
T5ENCODER = auto()
JAIS = auto()
NEMOTRON = auto()
EXAONE = auto()
GRANITE = auto()
GRANITE_MOE = auto()
CHAMELEON = auto()
OUTETTS_VOC = auto()
LLAMA = auto()
FALCON = auto()
BAICHUAN = auto()
GROK = auto()
GPT2 = auto()
GPTJ = auto()
GPTNEOX = auto()
MPT = auto()
STARCODER = auto()
REFACT = auto()
BERT = auto()
NOMIC_BERT = auto()
JINA_BERT_V2 = auto()
BLOOM = auto()
STABLELM = auto()
QWEN = auto()
QWEN2 = auto()
QWEN2MOE = auto()
QWEN2VL = auto()
PHI2 = auto()
PHI3 = auto()
PLAMO = auto()
CODESHELL = auto()
ORION = auto()
INTERNLM2 = auto()
MINICPM = auto()
MINICPM3 = auto()
GEMMA = auto()
GEMMA2 = auto()
STARCODER2 = auto()
RWKV6 = auto()
MAMBA = auto()
XVERSE = auto()
COMMAND_R = auto()
DBRX = auto()
OLMO = auto()
OLMO2 = auto()
OLMOE = auto()
OPENELM = auto()
ARCTIC = auto()
DEEPSEEK = auto()
DEEPSEEK2 = auto()
CHATGLM = auto()
BITNET = auto()
T5 = auto()
T5ENCODER = auto()
JAIS = auto()
NEMOTRON = auto()
EXAONE = auto()
GRANITE = auto()
GRANITE_MOE = auto()
CHAMELEON = auto()
WAVTOKENIZER_DEC = auto()


class MODEL_TENSOR(IntEnum):
Expand Down Expand Up @@ -390,59 +390,59 @@ class MODEL_TENSOR(IntEnum):


MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.LLAMA: "llama",
MODEL_ARCH.FALCON: "falcon",
MODEL_ARCH.BAICHUAN: "baichuan",
MODEL_ARCH.GROK: "grok",
MODEL_ARCH.GPT2: "gpt2",
MODEL_ARCH.GPTJ: "gptj",
MODEL_ARCH.GPTNEOX: "gptneox",
MODEL_ARCH.MPT: "mpt",
MODEL_ARCH.STARCODER: "starcoder",
MODEL_ARCH.REFACT: "refact",
MODEL_ARCH.BERT: "bert",
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
MODEL_ARCH.BLOOM: "bloom",
MODEL_ARCH.STABLELM: "stablelm",
MODEL_ARCH.QWEN: "qwen",
MODEL_ARCH.QWEN2: "qwen2",
MODEL_ARCH.QWEN2MOE: "qwen2moe",
MODEL_ARCH.QWEN2VL: "qwen2vl",
MODEL_ARCH.PHI2: "phi2",
MODEL_ARCH.PHI3: "phi3",
MODEL_ARCH.PLAMO: "plamo",
MODEL_ARCH.CODESHELL: "codeshell",
MODEL_ARCH.ORION: "orion",
MODEL_ARCH.INTERNLM2: "internlm2",
MODEL_ARCH.MINICPM: "minicpm",
MODEL_ARCH.MINICPM3: "minicpm3",
MODEL_ARCH.GEMMA: "gemma",
MODEL_ARCH.GEMMA2: "gemma2",
MODEL_ARCH.STARCODER2: "starcoder2",
MODEL_ARCH.RWKV6: "rwkv6",
MODEL_ARCH.MAMBA: "mamba",
MODEL_ARCH.XVERSE: "xverse",
MODEL_ARCH.COMMAND_R: "command-r",
MODEL_ARCH.DBRX: "dbrx",
MODEL_ARCH.OLMO: "olmo",
MODEL_ARCH.OLMO2: "olmo2",
MODEL_ARCH.OLMOE: "olmoe",
MODEL_ARCH.OPENELM: "openelm",
MODEL_ARCH.ARCTIC: "arctic",
MODEL_ARCH.DEEPSEEK: "deepseek",
MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.CHATGLM: "chatglm",
MODEL_ARCH.BITNET: "bitnet",
MODEL_ARCH.T5: "t5",
MODEL_ARCH.T5ENCODER: "t5encoder",
MODEL_ARCH.JAIS: "jais",
MODEL_ARCH.NEMOTRON: "nemotron",
MODEL_ARCH.EXAONE: "exaone",
MODEL_ARCH.GRANITE: "granite",
MODEL_ARCH.GRANITE_MOE: "granitemoe",
MODEL_ARCH.CHAMELEON: "chameleon",
MODEL_ARCH.OUTETTS_VOC: "outetts-voc",
MODEL_ARCH.LLAMA: "llama",
MODEL_ARCH.FALCON: "falcon",
MODEL_ARCH.BAICHUAN: "baichuan",
MODEL_ARCH.GROK: "grok",
MODEL_ARCH.GPT2: "gpt2",
MODEL_ARCH.GPTJ: "gptj",
MODEL_ARCH.GPTNEOX: "gptneox",
MODEL_ARCH.MPT: "mpt",
MODEL_ARCH.STARCODER: "starcoder",
MODEL_ARCH.REFACT: "refact",
MODEL_ARCH.BERT: "bert",
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
MODEL_ARCH.BLOOM: "bloom",
MODEL_ARCH.STABLELM: "stablelm",
MODEL_ARCH.QWEN: "qwen",
MODEL_ARCH.QWEN2: "qwen2",
MODEL_ARCH.QWEN2MOE: "qwen2moe",
MODEL_ARCH.QWEN2VL: "qwen2vl",
MODEL_ARCH.PHI2: "phi2",
MODEL_ARCH.PHI3: "phi3",
MODEL_ARCH.PLAMO: "plamo",
MODEL_ARCH.CODESHELL: "codeshell",
MODEL_ARCH.ORION: "orion",
MODEL_ARCH.INTERNLM2: "internlm2",
MODEL_ARCH.MINICPM: "minicpm",
MODEL_ARCH.MINICPM3: "minicpm3",
MODEL_ARCH.GEMMA: "gemma",
MODEL_ARCH.GEMMA2: "gemma2",
MODEL_ARCH.STARCODER2: "starcoder2",
MODEL_ARCH.RWKV6: "rwkv6",
MODEL_ARCH.MAMBA: "mamba",
MODEL_ARCH.XVERSE: "xverse",
MODEL_ARCH.COMMAND_R: "command-r",
MODEL_ARCH.DBRX: "dbrx",
MODEL_ARCH.OLMO: "olmo",
MODEL_ARCH.OLMO2: "olmo2",
MODEL_ARCH.OLMOE: "olmoe",
MODEL_ARCH.OPENELM: "openelm",
MODEL_ARCH.ARCTIC: "arctic",
MODEL_ARCH.DEEPSEEK: "deepseek",
MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.CHATGLM: "chatglm",
MODEL_ARCH.BITNET: "bitnet",
MODEL_ARCH.T5: "t5",
MODEL_ARCH.T5ENCODER: "t5encoder",
MODEL_ARCH.JAIS: "jais",
MODEL_ARCH.NEMOTRON: "nemotron",
MODEL_ARCH.EXAONE: "exaone",
MODEL_ARCH.GRANITE: "granite",
MODEL_ARCH.GRANITE_MOE: "granitemoe",
MODEL_ARCH.CHAMELEON: "chameleon",
MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
}

TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
Expand Down Expand Up @@ -1406,7 +1406,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
],
MODEL_ARCH.OUTETTS_VOC: [
MODEL_ARCH.WAVTOKENIZER_DEC: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.TOKEN_EMBD_NORM,
MODEL_TENSOR.CONV1D,
Expand Down
36 changes: 18 additions & 18 deletions gguf-py/gguf/tensor_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class TensorNameMap:
"emb_ln", # nomic-bert
"transformer.norm", # openelm
"rwkv.blocks.0.pre_ln", # rwkv
"backbone.norm", # outetts
"backbone.norm", # wavtokenizer
),

# Position embeddings
Expand All @@ -61,7 +61,7 @@ class TensorNameMap:
"lm_head.linear", # phi2
"output_layer", # chatglm
"head", # rwkv
"head.out", # outetts
"head.out", # wavtokenizer
),

# Output norm
Expand All @@ -82,7 +82,7 @@ class TensorNameMap:
"transformer.norm", # openelm
"model.norm", # nemotron
"rwkv.ln_out", # rwkv
"backbone.final_layer_norm", # outetts
"backbone.final_layer_norm", # wavtokenizer
),

# Rope frequencies
Expand Down Expand Up @@ -705,63 +705,63 @@ class TensorNameMap:
#############################################################################

MODEL_TENSOR.CONV_NEXT_DW: (
"backbone.convnext.{bid}.dwconv", # outetts
"backbone.convnext.{bid}.dwconv", # wavtokenizer
),

MODEL_TENSOR.CONV_NEXT_NORM: (
"backbone.convnext.{bid}.norm", # outetts
"backbone.convnext.{bid}.norm", # wavtokenizer
),

MODEL_TENSOR.CONV_NEXT_PW1: (
"backbone.convnext.{bid}.pwconv1", # outetts
"backbone.convnext.{bid}.pwconv1", # wavtokenizer
),

MODEL_TENSOR.CONV_NEXT_PW2: (
"backbone.convnext.{bid}.pwconv2", # outetts
"backbone.convnext.{bid}.pwconv2", # wavtokenizer
),

MODEL_TENSOR.CONV_NEXT_GAMMA: (
"backbone.convnext.{bid}.gamma", # outetts
"backbone.convnext.{bid}.gamma", # wavtokenizer
),

MODEL_TENSOR.POS_NET_CONV1: (
"backbone.pos_net.{bid}.conv1", # outetts
"backbone.pos_net.{bid}.conv1", # wavtokenizer
),

MODEL_TENSOR.POS_NET_CONV2: (
"backbone.pos_net.{bid}.conv2", # outetts
"backbone.pos_net.{bid}.conv2", # wavtokenizer
),

MODEL_TENSOR.POS_NET_NORM: (
"backbone.pos_net.{bid}.norm", # outetts
"backbone.pos_net.{bid}.norm", # wavtokenizer
),

MODEL_TENSOR.POS_NET_NORM1: (
"backbone.pos_net.{bid}.norm1", # outetts
"backbone.pos_net.{bid}.norm1", # wavtokenizer
),

MODEL_TENSOR.POS_NET_NORM2: (
"backbone.pos_net.{bid}.norm2", # outetts
"backbone.pos_net.{bid}.norm2", # wavtokenizer
),

MODEL_TENSOR.POS_NET_ATTN_NORM: (
"backbone.pos_net.{bid}.norm", # outetts
"backbone.pos_net.{bid}.norm", # wavtokenizer
),

MODEL_TENSOR.POS_NET_ATTN_Q: (
"backbone.pos_net.{bid}.q", # outetts
"backbone.pos_net.{bid}.q", # wavtokenizer
),

MODEL_TENSOR.POS_NET_ATTN_K: (
"backbone.pos_net.{bid}.k", # outetts
"backbone.pos_net.{bid}.k", # wavtokenizer
),

MODEL_TENSOR.POS_NET_ATTN_V: (
"backbone.pos_net.{bid}.v", # outetts
"backbone.pos_net.{bid}.v", # wavtokenizer
),

MODEL_TENSOR.POS_NET_ATTN_OUT: (
"backbone.pos_net.{bid}.proj_out", # outetts
"backbone.pos_net.{bid}.proj_out", # wavtokenizer
),
}

Expand Down
Loading

0 comments on commit 985d59f

Please sign in to comment.