12 | 12 | # See the License for the specific language governing permissions and |
13 | 13 | # limitations under the License. |
14 | 14 |
15 | | -"""Qwen3.5-VL finetuning recipes. |
| 15 | +"""Qwen3.5-VL recipes. |
16 | 16 |
17 | | -This module provides SFT and PEFT configurations for Qwen3.5-VL models: |
| 17 | +This module provides pretrain, SFT, and PEFT configurations for Qwen3.5-VL models: |
18 | 18 |
19 | 19 | - **Dense**: 800M, 2B, 4B, 9B, 27B |
20 | 20 | - **MoE**: 35B-A3B, 122B-A10B, 397B-A17B |
21 | 21 | """ |
22 | 22 |
| 23 | +from __future__ import annotations |
| 24 | + |
23 | 25 | import torch |
| 26 | +from typing_extensions import Unpack |
24 | 27 |
25 | 28 | from megatron.bridge import AutoBridge |
26 | 29 | from megatron.bridge.peft.base import PEFT |
27 | 30 | from megatron.bridge.recipes.common import _peft_common_vlm, _sft_common_vlm |
| 31 | +from megatron.bridge.recipes.qwen_vl.qwen3_vl import Qwen3VLCommonKwargs, _qwen3_vl_common |
28 | 32 | from megatron.bridge.recipes.utils.finetune_utils import default_peft_config |
29 | 33 | from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing |
30 | 34 | from megatron.bridge.training.config import ConfigContainer |
@@ -182,6 +186,90 @@ def _qwen35_vl_apply_peft_scheme(cfg: ConfigContainer, peft_scheme: str | PEFT) |
182 | 186 |     cfg.peft = peft_scheme |
183 | 187 |
184 | 188 |
| 189 | +# ============================================================================= |
| 190 | +# Qwen3.5-VL Pretrain Configurations (mock dataset) |
| 191 | +# ============================================================================= |
| 192 | +# Qwen3.5-VL reuses the Qwen3-VL _qwen3_vl_common helper for pretrain configs |
| 193 | +# since both families share the same VLM architecture and mock-dataset pipeline. |
| 194 | + |
| 195 | + |
| 196 | +def qwen35_vl_9b_pretrain_config(**user_kwargs: Unpack[Qwen3VLCommonKwargs]) -> ConfigContainer: |
| 197 | +    """Return a pre-training config for Qwen3.5-VL 9B (dense). |
| 198 | + |
| 199 | +    See `_qwen3_vl_common` for the full list of parameters. |
| 200 | +    """ |
| 201 | +    recommended_kwargs: Qwen3VLCommonKwargs = { |
| 202 | +        "hf_path": "Qwen/Qwen3.5-VL-9B", |
| 203 | +        "tensor_model_parallel_size": 4, |
| 204 | +        "pipeline_model_parallel_size": 1, |
| 205 | +        "expert_model_parallel_size": 1, |
| 206 | +        "freeze_language_model": True, |
| 207 | +        "freeze_vision_model": True, |
| 208 | +        "freeze_vision_projection": False, |
| 209 | +    } |
| 210 | +    combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} |
| 211 | +    return _qwen3_vl_common(**combined_kwargs) |
| 212 | + |
| 213 | + |
| 214 | +def qwen35_vl_35b_a3b_pretrain_config(**user_kwargs: Unpack[Qwen3VLCommonKwargs]) -> ConfigContainer: |
| 215 | +    """Return a pre-training config for Qwen3.5-VL 35B-A3B (MoE). |
| 216 | + |
| 217 | +    See `_qwen3_vl_common` for the full list of parameters. |
| 218 | +    """ |
| 219 | +    recommended_kwargs: Qwen3VLCommonKwargs = { |
| 220 | +        "hf_path": "Qwen/Qwen3.5-VL-35B-A3B", |
| 221 | +        "tensor_model_parallel_size": 4, |
| 222 | +        "pipeline_model_parallel_size": 2, |
| 223 | +        "expert_model_parallel_size": 4, |
| 224 | +        "sequence_parallel": True, |
| 225 | +        "freeze_language_model": True, |
| 226 | +        "freeze_vision_model": True, |
| 227 | +        "freeze_vision_projection": False, |
| 228 | +    } |
| 229 | +    combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} |
| 230 | +    return _qwen3_vl_common(**combined_kwargs) |
| 231 | + |
| 232 | + |
| 233 | +def qwen35_vl_122b_a10b_pretrain_config(**user_kwargs: Unpack[Qwen3VLCommonKwargs]) -> ConfigContainer: |
| 234 | +    """Return a pre-training config for Qwen3.5-VL 122B-A10B (MoE). |
| 235 | + |
| 236 | +    See `_qwen3_vl_common` for the full list of parameters. |
| 237 | +    """ |
| 238 | +    recommended_kwargs: Qwen3VLCommonKwargs = { |
| 239 | +        "hf_path": "Qwen/Qwen3.5-VL-122B-A10B", |
| 240 | +        "tensor_model_parallel_size": 4, |
| 241 | +        "pipeline_model_parallel_size": 8, |
| 242 | +        "expert_model_parallel_size": 8, |
| 243 | +        "context_parallel_size": 2, |
| 244 | +        "sequence_parallel": True, |
| 245 | +        "freeze_language_model": True, |
| 246 | +        "freeze_vision_model": True, |
| 247 | +        "freeze_vision_projection": False, |
| 248 | +    } |
| 249 | +    combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} |
| 250 | +    return _qwen3_vl_common(**combined_kwargs) |
| 251 | + |
| 252 | + |
| 253 | +def qwen35_vl_397b_a17b_pretrain_config(**user_kwargs: Unpack[Qwen3VLCommonKwargs]) -> ConfigContainer: |
| 254 | +    """Return a pre-training config for Qwen3.5-VL 397B-A17B (MoE). |
| 255 | + |
| 256 | +    See `_qwen3_vl_common` for the full list of parameters. |
| 257 | +    """ |
| 258 | +    recommended_kwargs: Qwen3VLCommonKwargs = { |
| 259 | +        "hf_path": "Qwen/Qwen3.5-VL-397B-A17B", |
| 260 | +        "tensor_model_parallel_size": 4, |
| 261 | +        "pipeline_model_parallel_size": 16, |
| 262 | +        "expert_model_parallel_size": 16, |
| 263 | +        "context_parallel_size": 2, |
| 264 | +        "sequence_parallel": True, |
| 265 | +        "freeze_language_model": True, |
| 266 | +        "freeze_vision_model": True, |
| 267 | +        "freeze_vision_projection": False, |
| 268 | +    } |
| 269 | +    combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs} |
| 270 | +    return _qwen3_vl_common(**combined_kwargs) |
| 271 | + |
| 272 | + |
185 | 273 | # ============================================================================= |
186 | 274 | # Qwen3.5-VL Dense SFT Configurations (800M, 2B, 4B, 9B, 27B) |
187 | 275 | # ============================================================================= |
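Usage note: each recipe merges caller kwargs last (`{**recommended_kwargs, **user_kwargs}`), so any user-supplied value overrides the recommended parallelism and freeze defaults. Below is a minimal sketch of calling one of the new pretrain configs; the import path is an assumption based on the neighboring `qwen3_vl` module and may differ in the final layout:

```python
# Hypothetical import path -- adjust to wherever this module actually lands.
from megatron.bridge.recipes.qwen_vl.qwen35_vl import qwen35_vl_9b_pretrain_config

# The defaults give projection-only training (language and vision towers frozen).
# User kwargs are merged last, so they win over the recommended values.
cfg = qwen35_vl_9b_pretrain_config(
    tensor_model_parallel_size=2,  # override the recommended TP=4
    freeze_language_model=False,   # also train the language model
)
```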