
Commit 696fd7f

cuichenx and claude committed
[recipe] Add pretrain configs with mock dataset for Qwen3-VL and Qwen3.5-VL
Add pretrain recipe configs using MockVLMConversationProvider for VLM pre-training with synthetic data. This restores previously deleted pretrain configs and extends coverage to Qwen3.5-VL.

- Qwen3-VL: 8B, 30B-A3B, 235B-A22B
- Qwen3.5-VL: 9B, 35B-A3B, 122B-A10B, 397B-A17B

The configs use a shared _qwen3_vl_common helper with a Qwen3VLCommonKwargs TypedDict for type-safe overrides. Existing perf scripts that import qwen3_vl_30b_a3b_pretrain_config / qwen3_vl_235b_a22b_pretrain_config continue to work without changes.

Co-Authored-By: Claude Opus 4.6 (1M context) <[email protected]>
Signed-off-by: Chen Cui <[email protected]>
1 parent 31d2b86 commit 696fd7f
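
The override mechanism named in the message is a plain dict merge behind a typed signature. A minimal usage sketch (the symbols come from the diffs below; the override value is illustrative, not a tuning recommendation):

from megatron.bridge.recipes.qwen_vl import qwen3_vl_30b_a3b_pretrain_config

# Caller kwargs are merged over the recipe's recommended defaults, so existing
# perf-script call sites keep working and can still override any single knob.
cfg = qwen3_vl_30b_a3b_pretrain_config(pipeline_model_parallel_size=1)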

3 files changed: 343 additions & 5 deletions

src/megatron/bridge/recipes/qwen_vl/__init__.py

Lines changed: 16 additions & 0 deletions
@@ -17,10 +17,13 @@
 from .qwen3_vl import (
     qwen3_vl_8b_peft_config,
     qwen3_vl_8b_peft_energon_config,
+    qwen3_vl_8b_pretrain_config,
     qwen3_vl_8b_sft_config,
     qwen3_vl_30b_a3b_peft_config,
+    qwen3_vl_30b_a3b_pretrain_config,
     qwen3_vl_30b_a3b_sft_config,
     qwen3_vl_235b_a22b_peft_config,
+    qwen3_vl_235b_a22b_pretrain_config,
     qwen3_vl_235b_a22b_sft_config,
 )
 from .qwen25_vl import (
@@ -41,22 +44,31 @@
     qwen35_vl_4b_peft_config,
     qwen35_vl_4b_sft_config,
     qwen35_vl_9b_peft_config,
+    qwen35_vl_9b_pretrain_config,
     qwen35_vl_9b_sft_config,
     qwen35_vl_27b_peft_config,
     qwen35_vl_27b_sft_config,
     qwen35_vl_35b_a3b_fsdp_sft_config,
     qwen35_vl_35b_a3b_peft_config,
+    qwen35_vl_35b_a3b_pretrain_config,
     qwen35_vl_35b_a3b_sft_config,
     qwen35_vl_122b_a10b_peft_config,
+    qwen35_vl_122b_a10b_pretrain_config,
     qwen35_vl_122b_a10b_sft_config,
     qwen35_vl_397b_a17b_peft_config,
+    qwen35_vl_397b_a17b_pretrain_config,
     qwen35_vl_397b_a17b_sft_config,
     qwen35_vl_800m_peft_config,
     qwen35_vl_800m_sft_config,
 )
 
 
 __all__ = [
+    # Qwen3.5-VL pretrain configs
+    "qwen35_vl_9b_pretrain_config",
+    "qwen35_vl_35b_a3b_pretrain_config",
+    "qwen35_vl_122b_a10b_pretrain_config",
+    "qwen35_vl_397b_a17b_pretrain_config",
     # Qwen3.5-VL SFT configs — dense
     "qwen35_vl_800m_sft_config",
     "qwen35_vl_2b_sft_config",
@@ -88,6 +100,10 @@
     "qwen25_vl_7b_peft_config",
     "qwen25_vl_32b_peft_config",
     "qwen25_vl_72b_peft_config",
+    # Qwen3-VL pretrain configs
+    "qwen3_vl_8b_pretrain_config",
+    "qwen3_vl_30b_a3b_pretrain_config",
+    "qwen3_vl_235b_a22b_pretrain_config",
     # Qwen3-VL SFT configs
     "qwen3_vl_8b_sft_config",
     "qwen3_vl_30b_a3b_sft_config",

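Because the new symbols are both imported and listed in __all__, the Qwen3.5-VL pretrain entry points resolve from the package root as well. A quick sketch using the recommended defaults shown in the next file:

from megatron.bridge.recipes.qwen_vl import qwen35_vl_9b_pretrain_config

# Picks up the recipe's recommended defaults (TP=4, frozen language and vision
# towers, trainable vision projection) on the mock dataset.
cfg = qwen35_vl_9b_pretrain_config()
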
src/megatron/bridge/recipes/qwen_vl/qwen35_vl.py

Lines changed: 90 additions & 2 deletions
@@ -12,19 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Qwen3.5-VL finetuning recipes.
+"""Qwen3.5-VL recipes.
 
-This module provides SFT and PEFT configurations for Qwen3.5-VL models:
+This module provides pretrain, SFT, and PEFT configurations for Qwen3.5-VL models:
 
 - **Dense**: 800M, 2B, 4B, 9B, 27B
 - **MoE**: 35B-A3B, 122B-A10B, 397B-A17B
 """
 
+from __future__ import annotations
+
 import torch
+from typing_extensions import Unpack
 
 from megatron.bridge import AutoBridge
 from megatron.bridge.peft.base import PEFT
 from megatron.bridge.recipes.common import _peft_common_vlm, _sft_common_vlm
+from megatron.bridge.recipes.qwen_vl.qwen3_vl import Qwen3VLCommonKwargs, _qwen3_vl_common
 from megatron.bridge.recipes.utils.finetune_utils import default_peft_config
 from megatron.bridge.recipes.utils.optimizer_utils import distributed_fused_adam_with_cosine_annealing
 from megatron.bridge.training.config import ConfigContainer
@@ -182,6 +186,90 @@ def _qwen35_vl_apply_peft_scheme(cfg: ConfigContainer, peft_scheme: str | PEFT)
     cfg.peft = peft_scheme
 
 
+# =============================================================================
+# Qwen3.5-VL Pretrain Configurations (mock dataset)
+# =============================================================================
+# Qwen3.5-VL reuses the Qwen3-VL _qwen3_vl_common helper for pretrain configs
+# since both families share the same VLM architecture and mock-dataset pipeline.
+
+
+def qwen35_vl_9b_pretrain_config(**user_kwargs: Unpack[Qwen3VLCommonKwargs]) -> ConfigContainer:
+    """Return a pre-training config for Qwen3.5-VL 9B (dense).
+
+    See `_qwen3_vl_common` for the full list of parameters.
+    """
+    recommended_kwargs: Qwen3VLCommonKwargs = {
+        "hf_path": "Qwen/Qwen3.5-9B",
+        "tensor_model_parallel_size": 4,
+        "pipeline_model_parallel_size": 1,
+        "expert_model_parallel_size": 1,
+        "freeze_language_model": True,
+        "freeze_vision_model": True,
+        "freeze_vision_projection": False,
+    }
+    combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs}
+    return _qwen3_vl_common(**combined_kwargs)
+
+
+def qwen35_vl_35b_a3b_pretrain_config(**user_kwargs: Unpack[Qwen3VLCommonKwargs]) -> ConfigContainer:
+    """Return a pre-training config for Qwen3.5-VL 35B-A3B (MoE).
+
+    See `_qwen3_vl_common` for the full list of parameters.
+    """
+    recommended_kwargs: Qwen3VLCommonKwargs = {
+        "hf_path": "Qwen/Qwen3.5-35B-A3B",
+        "tensor_model_parallel_size": 4,
+        "pipeline_model_parallel_size": 2,
+        "expert_model_parallel_size": 4,
+        "sequence_parallel": True,
+        "freeze_language_model": True,
+        "freeze_vision_model": True,
+        "freeze_vision_projection": False,
+    }
+    combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs}
+    return _qwen3_vl_common(**combined_kwargs)
+
+
+def qwen35_vl_122b_a10b_pretrain_config(**user_kwargs: Unpack[Qwen3VLCommonKwargs]) -> ConfigContainer:
+    """Return a pre-training config for Qwen3.5-VL 122B-A10B (MoE).
+
+    See `_qwen3_vl_common` for the full list of parameters.
+    """
+    recommended_kwargs: Qwen3VLCommonKwargs = {
+        "hf_path": "Qwen/Qwen3.5-122B-A10B",
+        "tensor_model_parallel_size": 4,
+        "pipeline_model_parallel_size": 8,
+        "expert_model_parallel_size": 8,
+        "context_parallel_size": 2,
+        "sequence_parallel": True,
+        "freeze_language_model": True,
+        "freeze_vision_model": True,
+        "freeze_vision_projection": False,
+    }
+    combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs}
+    return _qwen3_vl_common(**combined_kwargs)
+
+
+def qwen35_vl_397b_a17b_pretrain_config(**user_kwargs: Unpack[Qwen3VLCommonKwargs]) -> ConfigContainer:
+    """Return a pre-training config for Qwen3.5-VL 397B-A17B (MoE).
+
+    See `_qwen3_vl_common` for the full list of parameters.
+    """
+    recommended_kwargs: Qwen3VLCommonKwargs = {
+        "hf_path": "Qwen/Qwen3.5-397B-A17B",
+        "tensor_model_parallel_size": 4,
+        "pipeline_model_parallel_size": 16,
+        "expert_model_parallel_size": 16,
+        "context_parallel_size": 2,
+        "sequence_parallel": True,
+        "freeze_language_model": True,
+        "freeze_vision_model": True,
+        "freeze_vision_projection": False,
+    }
+    combined_kwargs: Qwen3VLCommonKwargs = {**recommended_kwargs, **user_kwargs}
+    return _qwen3_vl_common(**combined_kwargs)
+
+
 # =============================================================================
 # Qwen3.5-VL Dense SFT Configurations (800M, 2B, 4B, 9B, 27B)
 # =============================================================================
0 commit comments