diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 308d39b0a3..0f7acb4266 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -2993,8 +2993,9 @@ def training_data_args() -> list[ link_sys = make_link("systems", "training/training_data/systems") doc_systems = ( "The data systems for training. " - "This key can be provided with a list that specifies the systems, or be provided with a string " - "by which the prefix of all systems are given and the list of the systems is automatically generated." + "This key can be a list or a str. " + "When provided as a string, it can be a system directory path (containing 'type.raw') or a parent directory path to recursively search for all system subdirectories. " + "When provided as a list, each string item in the list is processed the same way as individual string inputs, i.e., each path can be a system directory or a parent directory to recursively search for all system subdirectories." ) doc_patterns = ( "The customized patterns used in `rglob` to collect all training systems. " @@ -3074,8 +3075,9 @@ def validation_data_args() -> list[ link_sys = make_link("systems", "training/validation_data/systems") doc_systems = ( "The data systems for validation. " - "This key can be provided with a list that specifies the systems, or be provided with a string " - "by which the prefix of all systems are given and the list of the systems is automatically generated." + "This key can be a list or a str. " + "When provided as a string, it can be a system directory path (containing 'type.raw') or a parent directory path to recursively search for all system subdirectories. " + "When provided as a list, each string item in the list is processed the same way as individual string inputs, i.e., each path can be a system directory or a parent directory to recursively search for all system subdirectories." ) doc_patterns = ( "The customized patterns used in `rglob` to collect all validation systems. 
" diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py index cf6e81aad1..4f22b3c380 100644 --- a/deepmd/utils/data_system.py +++ b/deepmd/utils/data_system.py @@ -790,6 +790,7 @@ def process_systems( """Process the user-input systems. If it is a single directory, search for all the systems in the directory. + If it is a list, each item in the list is treated as a directory to search. Check if the systems are valid. Parameters @@ -801,17 +802,31 @@ def process_systems( Returns ------- - list of str + result_systems: list of str The valid systems """ + # Normalize input to a list of paths to search if isinstance(systems, str): + search_paths = [systems] + elif isinstance(systems, list): + search_paths = systems + else: + # Handle unsupported input types + raise ValueError( + f"Invalid systems type: {type(systems)}. Must be str or list[str]." + ) + + # Iterate over the search_paths list and apply expansion logic to each path + result_systems = [] + for path in search_paths: if patterns is None: - systems = expand_sys_str(systems) + expanded_paths = expand_sys_str(path) else: - systems = rglob_sys_str(systems, patterns) - elif isinstance(systems, list): - systems = systems.copy() - return systems + expanded_paths = rglob_sys_str(path, patterns) + + result_systems.extend(expanded_paths) + + return result_systems def get_data( diff --git a/doc/train/training-advanced.md b/doc/train/training-advanced.md index 174c39d6d9..af4b4b31d9 100644 --- a/doc/train/training-advanced.md +++ b/doc/train/training-advanced.md @@ -76,8 +76,8 @@ Other training parameters are given in the {ref}`training ` section. The sections {ref}`training_data ` and {ref}`validation_data ` give the training dataset and validation dataset, respectively. Taking the training dataset for example, the keys are explained below: - {ref}`systems ` provide paths of the training data systems. DeePMD-kit allows you to provide multiple systems with different numbers of atoms. 
This key can be a `list` or a `str`. - - `list`: {ref}`systems <training/training_data/systems>` gives the training data systems. - - `str`: {ref}`systems <training/training_data/systems>` should be a valid path. DeePMD-kit will recursively search all data systems in this path. + - `str`: {ref}`systems <training/training_data/systems>` should be a valid path. It can be a system directory path (containing 'type.raw') or a parent directory path to recursively search for all system subdirectories. + - `list`: {ref}`systems <training/training_data/systems>` gives a list of paths. Each string item in the list is processed the same way as individual string inputs, i.e., each path can be a system directory or a parent directory to recursively search for all system subdirectories. - At each training step, DeePMD-kit randomly picks {ref}`batch_size <training/training_data/batch_size>` frame(s) from one of the systems. The probability of using a system is by default in proportion to the number of batches in the system. More options are available for automatically determining the probability of using systems. One can set the key {ref}`auto_prob <training/training_data/auto_prob>` to - `"prob_uniform"` all systems are used with the same probability. - `"prob_sys_size"` the probability of using a system is proportional to its size (number of frames).