
Commit 2b22e31

refactor model in intermediate api mode
1 parent bf8047e commit 2b22e31

8 files changed, +172 −2687 lines changed

examples/experiments/auto_parallel/llama/run_pretrain_auto.py

Lines changed: 25 additions & 25 deletions
@@ -24,41 +24,32 @@
 import paddle
 import paddle.distributed as dist
 
+from paddleformers.data.causal_dataset import (
+    build_train_valid_test_datasets,
+    check_data_split,
+    print_rank_0,
+)
 from paddleformers.ops import Topology
 from paddleformers.trainer import PdArgumentParser, get_last_checkpoint
-from paddleformers.trainer.auto_trainer import AutoTrainer
-from paddleformers.trainer.auto_training_args import AutoTrainingArguments
+from paddleformers.trainer.trainer import Trainer
 from paddleformers.trainer.trainer_utils import IntervalStrategy, _get_distributed_seeds
+from paddleformers.trainer.training_args import TrainingArguments
+from paddleformers.trainer.utils.doc import add_start_docstrings
 from paddleformers.transformers import (
     AutoTokenizer,
     CosineAnnealingWithWarmupDecay,
     LinearAnnealingWithWarmupDecay,
     LlamaConfig,
-    LlamaForCausalLM3DAuto,
-    LlamaForCausalLMNet,
-    LlamaPretrainingCriterion3DAuto,
-    LlamaPretrainingCriterionNet,
+    LlamaForCausalLM,
+    LlamaPretrainingCriterion,
 )
 from paddleformers.utils.log import logger
-
-MODEL_CLASSES = {
-    "llama": (LlamaConfig, LlamaForCausalLM3DAuto, LlamaPretrainingCriterion3DAuto),
-    "llama_network": (LlamaConfig, LlamaForCausalLMNet, LlamaPretrainingCriterionNet),
-}
-
-
-from paddleformers.data.causal_dataset import (
-    build_train_valid_test_datasets,
-    check_data_split,
-    print_rank_0,
-)
-from paddleformers.trainer.utils.doc import add_start_docstrings
 from paddleformers.utils.tools import get_env_device
 
 
 @dataclass
-@add_start_docstrings(AutoTrainingArguments.__doc__)
-class PreTrainingArguments(AutoTrainingArguments):
+@add_start_docstrings(TrainingArguments.__doc__)
+class PreTrainingArguments(TrainingArguments):
     min_learning_rate: float = field(
         default=1e-5,
         metadata={"help": "Minimum learning rate deacyed to."},
@@ -338,7 +329,7 @@ def get_train_data_file(args):
     return files
 
 
-class PretrainingTrainer(AutoTrainer):
+class PretrainingTrainer(Trainer):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.is_pretraining = True
@@ -474,7 +465,9 @@ def main():
             "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
         )
 
-    config_class, model_class, criterion_class = MODEL_CLASSES[model_args.model_type]
+    config_class = LlamaConfig
+    model_class = LlamaForCausalLM
+    criterion_class = LlamaPretrainingCriterion
 
     config = config_class.from_pretrained(model_args.model_name_or_path)
     tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path)
@@ -542,8 +535,6 @@ def main():
         # It's OK, not use accumulate_steps optimization
         pass
 
-    print("Final pre-training config:", config)
-
     if (
         "replace_with_parallel_cross_entropy" in training_args.tensor_parallel_config
        and config.tensor_parallel_degree > 1
@@ -553,6 +544,15 @@ def main():
 
         replace_cross_entropy()
 
+    if training_args.use_intermediate_api:
+        config.run_single_model = True
+        config.tensor_parallel_degree = 1
+        config.sharding_parallel_degree = 1
+        config.sep_parallel_degree = 1
+        config.context_parallel_degree = 1
+
+    print("Final pre-training config:", config)
+
     # # Set the dtype for loading model
     # dtype = "float32"
     # if training_args.fp16_opt_level == "O2":
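
The key behavioural change in this script is the new `use_intermediate_api` branch: the config is degraded to a single-card layout before the network is built, and the intermediate auto-parallel API is expected to re-apply tensor/sequence/pipeline parallelism afterwards from the model's distribution plan. A minimal sketch of that branch in isolation follows; the SimpleNamespace objects stand in for the real TrainingArguments and LlamaConfig instances and are purely illustrative.

from types import SimpleNamespace

# Illustrative stand-ins for the real TrainingArguments / LlamaConfig objects.
training_args = SimpleNamespace(use_intermediate_api=True)
config = SimpleNamespace(
    run_single_model=False,
    tensor_parallel_degree=4,
    sharding_parallel_degree=2,
    sep_parallel_degree=1,
    context_parallel_degree=1,
)

# Mirrors the new branch above: build the network as a plain single-card model
# and leave every parallelism decision to the intermediate API.
if training_args.use_intermediate_api:
    config.run_single_model = True
    config.tensor_parallel_degree = 1
    config.sharding_parallel_degree = 1
    config.sep_parallel_degree = 1
    config.context_parallel_degree = 1

print("Final pre-training config:", config)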

paddleformers/transformers/__init__.py

Lines changed: 0 additions & 35 deletions
@@ -210,41 +210,6 @@
         "LlamaPretrainingCriterion",
         "LlamaNTKScalingRotaryEmbedding",
     ],
-    "llama.modeling_auto": [
-        "enable_fuse_ffn_qkv_pass",
-        "LlamaDecoderLayerAuto",
-        "LlamaAttentionAuto",
-        "LlamaPretrainedModelAuto",
-        "LlamaLMHeadAuto",
-        "LlamaModelAuto",
-        "LlamaForCausalLM3DAuto",
-        "LlamaMLPAuto",
-        "get_mesh",
-        "LlamaRMSNormAuto",
-        "is_pp_enable",
-        "LlamaPretrainingCriterion3DAuto",
-        "global_mesh_starts_with_pp",
-        "scaled_dot_product_attention",
-    ],
-    "llama.modeling_network": [
-        "LlamaPretrainedModelNet",
-        "layer_input_parallel_row_and_col_hook",
-        "LlamaModelNet",
-        "LlamaPretrainingCriterionNet",
-        "layer_input_replicate_hook",
-        "LlamaLMHeadNet",
-        "LlamaForCausalLMNetDPO",
-        "GlobalOutputNet",
-        "layer_input_parallel_row_hook",
-        "LlamaRMSNormNet",
-        "LlamaAttentionNet",
-        "scaled_dot_product_attention",
-        "ReshardLayer",
-        "LlamaForCausalLMNet",
-        "enable_fuse_ffn_qkv_pass",
-        "LlamaMLPNet",
-        "LlamaDecoderLayerNet",
-    ],
     "llama.modeling_pp": ["LlamaForCausalLMPipe"],
     "llama.tokenizer": ["LlamaTokenizer", "Llama3Tokenizer"],
     "llama.tokenizer_fast": ["LlamaTokenizerFast"],
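
With the `llama.modeling_auto` and `llama.modeling_network` entries dropped from the lazy-import table, only the single-model classes remain importable from the package root, which is exactly what the refactored pretraining script uses:

# After this commit, the script relies solely on the standard exports;
# the *3DAuto and *Net variants are no longer listed in the import table.
from paddleformers.transformers import LlamaConfig, LlamaForCausalLM, LlamaPretrainingCriterion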

paddleformers/transformers/configuration_utils.py

Lines changed: 10 additions & 0 deletions
@@ -537,6 +537,9 @@ class PretrainedConfig:
            Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
            model has a output word embedding layer.
 
+        run_single_model (`bool`, *optional*, defaults to `False`):
+            Whether to run the model in single card mode. When enabled, all parallel degree configurations will be disabled.
+
        dtype (`str`, *optional*):
            The `dtype` of the weights. This attribute can be used to initialize the model to a non-default `dtype`
            (which is normally `float32`) and thus allow for optimal storage allocation. For example, if the saved
@@ -601,6 +604,13 @@ def __init__(self, **kwargs):
        self.use_cache = kwargs.pop("use_cache", False)
        self.tie_word_embeddings = kwargs.pop("tie_word_embeddings", True)
 
+        # for run model in single card mode
+        self.run_single_model = kwargs.pop("run_single_model", False)
+        if self.run_single_model:
+            self.tensor_parallel_degree = 1
+            self.sep_parallel_degree = 1
+            self.context_parallel_degree = 1
+
        # for transformers fuse
        self.fuse_linear = kwargs.pop("fuse_linear", False)
        self.fuse_attention_qkv = kwargs.pop("fuse_attention_qkv", False)
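
The new kwarg is consumed in `PretrainedConfig.__init__`, so any config built with `run_single_model=True` describes a single-card layout. A small illustration, constructing `LlamaConfig` directly only to show the effect of the flag:

from paddleformers.transformers import LlamaConfig

# Per the __init__ change above: run_single_model is popped from kwargs and,
# when True, the tensor/sep/context parallel degrees are pinned to 1.
config = LlamaConfig(run_single_model=True)
print(config.run_single_model)         # True
print(config.tensor_parallel_degree)   # 1
print(config.sep_parallel_degree)      # 1
print(config.context_parallel_degree)  # 1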

paddleformers/transformers/llama/__init__.py

Lines changed: 0 additions & 35 deletions
@@ -50,41 +50,6 @@
         "LlamaPretrainingCriterion",
         "LlamaNTKScalingRotaryEmbedding",
     ],
-    "modeling_auto": [
-        "enable_fuse_ffn_qkv_pass",
-        "LlamaDecoderLayerAuto",
-        "LlamaAttentionAuto",
-        "LlamaPretrainedModelAuto",
-        "LlamaLMHeadAuto",
-        "LlamaModelAuto",
-        "LlamaForCausalLM3DAuto",
-        "LlamaMLPAuto",
-        "get_mesh",
-        "LlamaRMSNormAuto",
-        "is_pp_enable",
-        "LlamaPretrainingCriterion3DAuto",
-        "global_mesh_starts_with_pp",
-        "scaled_dot_product_attention",
-    ],
-    "modeling_network": [
-        "LlamaPretrainedModelNet",
-        "layer_input_parallel_row_and_col_hook",
-        "LlamaModelNet",
-        "LlamaPretrainingCriterionNet",
-        "layer_input_replicate_hook",
-        "LlamaLMHeadNet",
-        "LlamaForCausalLMNetDPO",
-        "GlobalOutputNet",
-        "layer_input_parallel_row_hook",
-        "LlamaRMSNormNet",
-        "LlamaAttentionNet",
-        "scaled_dot_product_attention",
-        "ReshardLayer",
-        "LlamaForCausalLMNet",
-        "enable_fuse_ffn_qkv_pass",
-        "LlamaMLPNet",
-        "LlamaDecoderLayerNet",
-    ],
     "modeling_pp": ["LlamaForCausalLMPipe"],
     "tokenizer": ["LlamaTokenizer", "Llama3Tokenizer"],
     "tokenizer_fast": ["LlamaTokenizerFast"],
Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import paddle.distributed as dist
+from paddle.distributed.auto_parallel.intermediate.tensor_parallel import (
+    PrepareLayerInput,
+)
+
+
+def layer_input_parallel_row_hook(process_mesh):
+    def hook(layer, inputs, output=None):
+        res_inputs = []
+        for input in inputs:
+            if not input.is_dist():
+                x = dist.shard_tensor(input, process_mesh, [dist.Shard(0), dist.Replicate()])
+                res_inputs.append(dist.reshard(x, process_mesh, [dist.Shard(0), dist.Replicate()]))
+            else:
+                res_inputs.append(dist.reshard(input, process_mesh, [dist.Shard(0), dist.Replicate()]))
+        return tuple(res_inputs)
+
+    return hook
+
+
+def layer_input_parallel_row_and_col_hook(process_mesh):
+    def hook(layer, inputs, output=None):
+        res_inputs = []
+        for input in inputs:
+            if not input.is_dist():
+                x = dist.shard_tensor(input, process_mesh, [dist.Shard(0), dist.Shard(1)])
+                res_inputs.append(dist.reshard(x, process_mesh, [dist.Shard(0), dist.Shard(1)]))
+            else:
+                res_inputs.append(dist.reshard(input, process_mesh, [dist.Shard(0), dist.Shard(1)]))
+        return tuple(res_inputs)
+
+    return hook
+
+
+def layer_input_replicate_hook(process_mesh):
+    def hook(layer, inputs, output=None):
+        res_inputs = []
+        for input in inputs:
+            if not input.is_dist():
+                x = dist.shard_tensor(input, process_mesh, [dist.Replicate(), dist.Replicate()])
+                res_inputs.append(dist.reshard(x, process_mesh, [dist.Replicate(), dist.Replicate()]))
+            else:
+                res_inputs.append(dist.reshard(input, process_mesh, [dist.Replicate(), dist.Replicate()]))
+        return tuple(res_inputs)
+
+    return hook
+
+
+def auto_dist_config(self, prefix=""):
+    if prefix != "":
+        assert prefix.endswith(".")
+    config = {
+        "sp_config": {
+            "parallelize_plan": {
+                f"{prefix}llama.embed_tokens": [
+                    dist.ColWiseParallel(),
+                    dist.SequenceParallelBegin(),
+                ],
+                f"{prefix}llama.reshard_row": PrepareLayerInput(layer_input_parallel_row_hook),
+                f"{prefix}llama.reshard_row_and_col": PrepareLayerInput(layer_input_parallel_row_and_col_hook),
+                f"{prefix}llama.global_layer.reshard_replicate": PrepareLayerInput(layer_input_replicate_hook),
+                f"{prefix}llama.layers.*.self_attn.qkv_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.q_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.k_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.v_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.o_proj": dist.RowWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn": dist.SequenceParallelDisable(),
+                f"{prefix}llama.layers.*.mlp.gate_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.up_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.gate_up_fused_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.down_proj": dist.RowWiseParallel(),
+                f"{prefix}llama.layers.*.mlp": dist.SequenceParallelDisable(need_transpose=False),
+                f"{prefix}lm_head.weight": dist.ColWiseParallel(),
+                f"{prefix}lm_head": dist.SequenceParallelEnd(),
+            }
+        },
+        "mp_config": {
+            "parallelize_plan": {
+                f"{prefix}llama.embed_tokens": dist.ColWiseParallel(gather_output=True),
+                f"{prefix}llama.reshard_row": PrepareLayerInput(layer_input_parallel_row_hook),
+                f"{prefix}llama.reshard_row_and_col": PrepareLayerInput(layer_input_parallel_row_and_col_hook),
+                f"{prefix}llama.global_layer.reshard_replicate": PrepareLayerInput(layer_input_replicate_hook),
+                f"{prefix}llama.layers.*.self_attn.qkv_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.q_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.k_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.v_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.o_proj": dist.RowWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.gate_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.up_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.gate_up_fused_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.down_proj": dist.RowWiseParallel(),
+                f"{prefix}lm_head.weight": dist.ColWiseParallel(),
+            }
+        },
+        "pp_config": {"split_spec": f"{prefix}llama.layers", "global_spec": f"{prefix}llama.global_layer"},
+    }
+
+    return config
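
The returned dict is keyed the way Paddle's intermediate-level auto-parallel API expects (`sp_config`, `mp_config`, `pp_config`, each wrapping a `parallelize_plan`): column/row-wise sharding for the attention and MLP projections, sequence-parallel begin/end markers around the embedding and LM head, and a pipeline split along `llama.layers`. Below is a rough usage sketch under two assumptions that go beyond this diff: that `auto_dist_config` is bound as a method on `LlamaForCausalLM` (its `self` parameter suggests so), and that the entry point is a `paddle.distributed.parallelize`-style call whose exact signature and return value may differ across Paddle releases.

import paddle.distributed as dist
from paddleformers.transformers import LlamaConfig, LlamaForCausalLM

# Build the network as a plain single-card model first (see run_pretrain_auto.py above).
config = LlamaConfig(run_single_model=True)
model = LlamaForCausalLM(config)

# Assumed to be bound as a model method; returns
# {"sp_config": ..., "mp_config": ..., "pp_config": ...}.
dist_config = model.auto_dist_config()

# Assumed call shape for the intermediate API; check your Paddle release
# for the exact signature and return value.
parallelized = dist.parallelize(model, config=dist_config)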
