Source code for calorine.nep.training_factory

from os import makedirs
from pathlib import Path
from os.path import exists, join as join_path
from typing import Any, Dict, List, Optional

import numpy as np
from ase import Atoms
from sklearn.model_selection import KFold

from .io import write_nepfile, write_structures


def setup_training(parameters: Dict[str, Any],
                   structures: List[Atoms],
                   enforced_structures: List[int] = [],
                   rootdir: str = '.',
                   mode: str = 'kfold',
                   n_splits: Optional[int] = None,
                   train_fraction: Optional[float] = None,
                   seed: int = 42,
                   overwrite: bool = False,
                   ) -> None:
    """Sets up the input files for training a NEP via the ``nep``
    executable of the GPUMD package.

    Parameters
    ----------
    parameters
        dictionary containing the parameters to be set in the nep.in file;
        see `here <https://gpumd.org/nep/input_parameters/index.html>`__
        for an overview of these parameters
    structures
        list of structures to be included
    enforced_structures
        structures that _must_ be included in the training set, provided
        as a list of indices that refer to the content of the
        ``structures`` parameter
    rootdir
        root directory in which to create the input files
    mode
        how the train-test split is performed;
        options: ``'kfold'`` and ``'bagging'``
    n_splits
        number of train-test splits of the input structures to perform;
        by default no split is done and all input structures are used
        for training
    train_fraction
        fraction of structures to use for training when mode
        ``'bagging'`` is used
    seed
        random number generator seed; this ensures reproducibility
    overwrite
        if True overwrite the content of ``rootdir`` if it exists
    """
    if exists(rootdir) and not overwrite:
        raise FileExistsError('Output directory exists.'
                              ' Set overwrite=True in order to override this behavior.')

    if n_splits is not None and (n_splits <= 0 or n_splits > len(structures)):
        raise ValueError(f'n_splits ({n_splits}) must be positive and'
                         f' must not exceed {len(structures)}.')

    if mode == 'kfold' and train_fraction is not None:
        raise ValueError(f'train_fraction cannot be set when mode {mode} is used')
    elif mode == 'bagging' and (train_fraction is None
                                or not 0 < train_fraction <= 1):
        raise ValueError(f'train_fraction ({train_fraction}) must be in (0,1]')

    rs = np.random.RandomState(seed)

    _prepare_training(parameters, structures, enforced_structures,
                      rootdir, mode, n_splits, train_fraction, rs)
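

# Illustrative usage sketch, not part of the module: one plausible way to call
# ``setup_training`` for a five-fold split. The file name 'structures.xyz' and
# the nep.in parameter values below are placeholder assumptions.
def _example_setup_training() -> None:
    from ase.io import read
    structures = read('structures.xyz', index=':')  # list of ASE Atoms objects
    parameters = dict(version=4, type=[2, 'Au', 'Cu'], generation=100000)
    setup_training(parameters, structures,
                   rootdir='training', mode='kfold', n_splits=5,
                   overwrite=True)
    # This creates training/nepmodel_full (all structures used for training)
    # as well as training/nepmodel_split1 ... training/nepmodel_split5, each
    # containing nep.in, train.xyz, and test.xyz.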

def _prepare_training(parameters: Dict[str, Any],
                      structures: List[Atoms],
                      enforced_structures: List[int],
                      rootdir: str,
                      mode: str,
                      n_splits: Optional[int],
                      train_fraction: Optional[float],
                      rs: np.random.RandomState) -> None:
    """Prepares training and test sets and writes structural data
    as well as parameter files.

    See the docstring of `setup_training` for documentation of the parameters.
    """
    dirname = join_path(rootdir, 'nepmodel_full')
    makedirs(dirname, exist_ok=True)
    _write_structures(structures, dirname, list(range(len(structures))), [0])
    write_nepfile(parameters, dirname)

    if n_splits is None:
        return

    n_structures = len(structures)
    remaining_structures = list(set(range(n_structures)) - set(enforced_structures))

    if mode == 'kfold':
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=rs)
        for k, (train_indices, test_indices) in enumerate(kf.split(remaining_structures)):
            train_selection = [remaining_structures[x] for x in train_indices]
            # append enforced structures at the end of the training set
            train_selection.extend(enforced_structures)
            test_selection = [remaining_structures[x] for x in test_indices]

            # sanity check: make sure there is no overlap between train and test
            assert set(train_selection).intersection(set(test_selection)) == set(), \
                'Train and test set should not overlap'

            dirname = join_path(rootdir, f'nepmodel_split{k + 1}')
            makedirs(dirname, exist_ok=True)
            _write_structures(structures, dirname, train_selection, test_selection)
            write_nepfile(parameters, dirname)

    elif mode == 'bagging':
        for k in range(n_splits):
            train_selection = rs.choice(
                remaining_structures,
                size=int(train_fraction * n_structures) - len(enforced_structures),
                replace=False)
            # append enforced structures at the end of the training set
            train_selection = list(train_selection)
            train_selection.extend(enforced_structures)
            # add the remaining structures to the test set
            test_selection = list(set(range(n_structures)) - set(train_selection))

            # sanity check: make sure there is no overlap between train and test
            assert set(train_selection).intersection(set(test_selection)) == set(), \
                'Train and test set should not overlap'

            dirname = join_path(rootdir, f'nepmodel_split{k + 1}')
            makedirs(dirname, exist_ok=True)
            _write_structures(structures, dirname, train_selection, test_selection)
            write_nepfile(parameters, dirname)

    else:
        raise ValueError(f'Unknown value for mode: {mode}.')


def _write_structures(structures: List[Atoms],
                      dirname: str,
                      train_selection: List[int],
                      test_selection: List[int]) -> None:
    """Writes structures in a format readable by the nep executable.

    See the docstring of `setup_training` for documentation of the parameters.
    """
    write_structures(
        join_path(dirname, 'train.xyz'),
        [s for k, s in enumerate(structures) if k in train_selection])
    write_structures(
        join_path(dirname, 'test.xyz'),
        [s for k, s in enumerate(structures) if k in test_selection])
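

# Standalone sketch, with made-up numbers, of the index mapping used in the
# 'kfold' branch above: KFold yields positions into ``remaining_structures``,
# which are translated back to global structure indices, so that enforced
# structures always land in the training set and never in a test set.
def _example_kfold_index_mapping() -> None:
    rs = np.random.RandomState(42)
    n_structures = 10
    enforced = [0, 1]
    remaining = list(set(range(n_structures)) - set(enforced))  # [2, ..., 9]
    kf = KFold(n_splits=4, shuffle=True, random_state=rs)
    for k, (train_pos, test_pos) in enumerate(kf.split(remaining)):
        train = [remaining[x] for x in train_pos] + enforced
        test = [remaining[x] for x in test_pos]
        assert not set(train) & set(test)
        print(f'split {k + 1}: train={sorted(train)}, test={sorted(test)}')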

def setup_fine_tuning_nep89(parameters: Dict[str, Any],
                            nep: Path,
                            restart: Path,
                            **kwargs_to_setup_training) -> None:
    """Sets up a fine-tuning of the NEP89 foundation model.

    Note that only the types, the number of generations, the batch size,
    the population size, and the regularization parameters are allowed to
    be changed. The types must be a subset of the 89 atomic species
    supported by the NEP89 foundation model.

    This function wraps :func:`setup_training`.

    Parameters
    ----------
    parameters
        dictionary containing the parameters to be set in the `nep.in` file;
        see `here <https://gpumd.org/nep/input_parameters/index.html>`__
        for an overview of these parameters.
        Note that only `lambda_1`, `lambda_2`, `lambda_e`, `lambda_f`,
        `lambda_v`, `generation`, `population`, `type`, and `batch` are
        allowed parameters when fine-tuning.
    nep
        path to the `nep.txt` file for NEP89
    restart
        path to the `nep.restart` file for NEP89
    kwargs_to_setup_training
        see the docstring of `setup_training` for the remaining parameters
    """
    allowed_parameters = ['type', 'lambda_1', 'lambda_2', 'lambda_e',
                          'lambda_f', 'lambda_v', 'generation',
                          'population', 'batch']
    for param in parameters.keys():
        if param not in allowed_parameters:
            raise ValueError(f'Parameter {param} not allowed when fine-tuning.')
    if not Path(nep).is_file():
        raise FileNotFoundError(f'{nep} does not exist.')
    if not Path(restart).is_file():
        raise FileNotFoundError(f'{restart} does not exist.')

    # Default parameters that need to be set for NEP89.
    nep89_parameters = dict(version=4,
                            zbl=2,
                            cutoff=[6, 5],
                            n_max=[4, 4],
                            basis_size=[8, 8],
                            l_max=[4, 2, 1],
                            neuron=80)
    fine_tuning = (dict(fine_tune=[str(nep), str(restart)])
                   | parameters
                   | nep89_parameters)
    setup_training(fine_tuning, **kwargs_to_setup_training)
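

# Illustrative fine-tuning sketch, not part of the module: the file paths and
# parameter values below are placeholder assumptions. All keyword arguments
# other than ``parameters``, ``nep``, and ``restart`` are forwarded unchanged
# to ``setup_training``.
def _example_setup_fine_tuning() -> None:
    from ase.io import read
    structures = read('structures.xyz', index=':')
    parameters = dict(type=[2, 'Au', 'Cu'], generation=50000, lambda_v=0.5)
    setup_fine_tuning_nep89(parameters,
                            nep=Path('nep89.txt'),
                            restart=Path('nep89.restart'),
                            structures=structures,
                            rootdir='fine_tuning',
                            overwrite=True)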