14b conversion

miguelmartin75 · miguelmartin75 · commit 3e147ede76dc · 2025-12-20T07:14:26.000Z
diff --git a/scripts/convert_cosmos_to_diffusers.py b/scripts/convert_cosmos_to_diffusers.py
@@ -29,13 +29,52 @@
 
 Convert checkpoint
 ```bash
+# pre-trained
 transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-2B/snapshots/865baf084d4c9e850eac59a021277d5a9b9e8b63/base/pre-trained/d20b7120-df3e-4911-919d-db6e08bad31c_ema_bf16.pt
 
 python scripts/convert_cosmos_to_diffusers.py \
     --transformer_type Cosmos-2.5-Predict-Base-2B \
     --transformer_ckpt_path $transformer_ckpt_path \
     --vae_type wan2.1 \
-    --output_path converted/cosmos-p2.5-base-2b \
+    --output_path converted/2b/d20b7120-df3e-4911-919d-db6e08bad31c \
+    --save_pipeline
+
+# post-trained
+transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-2B/snapshots/865baf084d4c9e850eac59a021277d5a9b9e8b63/base/post-trained/81edfebe-bd6a-4039-8c1d-737df1a790bf_ema_bf16.pt
+
+python scripts/convert_cosmos_to_diffusers.py \
+    --transformer_type Cosmos-2.5-Predict-Base-2B \
+    --transformer_ckpt_path $transformer_ckpt_path \
+    --vae_type wan2.1 \
+    --output_path converted/2b/81edfebe-bd6a-4039-8c1d-737df1a790bf \
+    --save_pipeline
+```
+
+## 14B
+
+```bash
+hf download nvidia/Cosmos-Predict2.5-14B
+```
+
+```bash
+# pre-trained
+transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-14B/snapshots/03eb354f35eae0d6e0c1be3c9f94d8551e125570/base/pre-trained/54937b8c-29de-4f04-862c-e67b04ec41e8_ema_bf16.pt
+
+python scripts/convert_cosmos_to_diffusers.py \
+    --transformer_type Cosmos-2.5-Predict-Base-14B \
+    --transformer_ckpt_path $transformer_ckpt_path \
+    --vae_type wan2.1 \
+    --output_path converted/14b/54937b8c-29de-4f04-862c-e67b04ec41e8/ \
+    --save_pipeline
+
+# post-trained
+transformer_ckpt_path=~/.cache/huggingface/hub/models--nvidia--Cosmos-Predict2.5-14B/snapshots/03eb354f35eae0d6e0c1be3c9f94d8551e125570/base/pre-trained/e21d2a49-4747-44c8-ba44-9f6f9243715f_ema_bf16.pt
+
+python scripts/convert_cosmos_to_diffusers.py \
+    --transformer_type Cosmos-2.5-Predict-Base-14B \
+    --transformer_ckpt_path $transformer_ckpt_path \
+    --vae_type wan2.1 \
+    --output_path converted/14b/e21d2a49-4747-44c8-ba44-9f6f9243715f/ \
     --save_pipeline
 ```
 
@@ -298,6 +337,25 @@ def rename_transformer_blocks_(key: str, state_dict: Dict[str, Any]):
         "crossattn_proj_in_channels": 100352,
         "encoder_hidden_states_channels": 1024,
     },
+    "Cosmos-2.5-Predict-Base-14B": {
+        "in_channels": 16 + 1,
+        "out_channels": 16,
+        "num_attention_heads": 40,
+        "attention_head_dim": 128,
+        "num_layers": 36,
+        "mlp_ratio": 4.0,
+        "text_embed_dim": 1024,
+        "adaln_lora_dim": 256,
+        "max_size": (128, 240, 240),
+        "patch_size": (1, 2, 2),
+        "rope_scale": (1.0, 3.0, 3.0),
+        "concat_padding_mask": True,
+        # NOTE: source config has pos_emb_learnable: 'True' - but params are missing
+        "extra_pos_embed_type": None,
+        "use_crossattn_projection": True,
+        "crossattn_proj_in_channels": 100352,
+        "encoder_hidden_states_channels": 1024,
+    },
 }
 
 VAE_KEYS_RENAME_DICT = {
diff --git a/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py b/src/diffusers/pipelines/cosmos/pipeline_cosmos2_5_predict.py
@@ -133,7 +133,7 @@ def retrieve_latents(
         ...     num_frames=93,
         ...     generator=torch.Generator().manual_seed(1),
         ... ).frames[0]
-        >>> # export_to_video(video, "image2world.mp4", fps=16)
+        >>> export_to_video(video, "image2world.mp4", fps=16)
 
         >>> # Video2World: condition on an input clip and predict a 93-frame world video.
         >>> prompt = (