# Source code for calorine.nep.training_factory

from os import makedirs
from os.path import exists, join as join_path
from typing import List, NamedTuple, Optional

import numpy as np
from ase import Atoms
from sklearn.model_selection import KFold

from .io import write_nepfile, write_structures


def setup_training(parameters: NamedTuple,
                   structures: List[Atoms],
                   enforced_structures: Optional[List[int]] = None,
                   rootdir: str = '.',
                   mode: str = 'kfold',
                   n_splits: Optional[int] = None,
                   train_fraction: Optional[float] = None,
                   seed: int = 42,
                   overwrite: bool = False,
                   ) -> None:
    """Sets up the input files for training a NEP via the ``nep``
    executable of the GPUMD package.

    All arguments are validated before anything is written to disk.

    Parameters
    ----------
    parameters
        parameters to be set in the nep.in file; see `here
        <https://gpumd.org/nep/input_parameters/index.html>`__ for an
        overview of these parameters
    structures
        list of structures to be included
    enforced_structures
        structures that _must_ be included in the training set, provided
        in the form of a list of indices that refer to the content of the
        ``structures`` parameter; ``None`` (the default) means that no
        structure is enforced
    rootdir
        root directory in which to create the input files
    mode
        how the test-train split is performed. Options: ``'kfold'`` and
        ``'bagging'``
    n_splits
        number of splits of the input structures in training and test
        sets that ought to be performed; by default no split will be done
        and all input structures will be used for training
    train_fraction
        fraction of structures to use for training when mode
        ``'bagging'`` is used; must lie in (0, 1] and must not be set
        when mode ``'kfold'`` is used
    seed
        random number generator seed to be used; this ensures
        reproducibility
    overwrite
        if True overwrite the content of ``rootdir`` if it exists

    Raises
    ------
    FileExistsError
        if ``rootdir`` exists and ``overwrite`` is False
    ValueError
        if ``n_splits`` is not in ``[1, len(structures)]``, if
        ``train_fraction`` is set in ``'kfold'`` mode, or if
        ``train_fraction`` is missing or outside (0, 1] in ``'bagging'``
        mode
    """
    if exists(rootdir) and not overwrite:
        raise FileExistsError('Output directory exists.'
                              ' Set overwrite=True in order to override this behavior.')

    if n_splits is not None and (n_splits <= 0 or n_splits > len(structures)):
        raise ValueError(f'n_splits ({n_splits}) must be positive and'
                         f' must not exceed {len(structures)}.')

    if mode == 'kfold' and train_fraction is not None:
        raise ValueError(f'train_fraction cannot be set when mode {mode} is used')
    elif mode == 'bagging':
        # train_fraction defaults to None; comparing None with a number
        # would raise a TypeError, so report the missing value explicitly.
        if train_fraction is None:
            raise ValueError(f'train_fraction must be set when mode {mode} is used')
        if train_fraction <= 0 or train_fraction > 1:
            raise ValueError(f'train_fraction ({train_fraction}) must be in (0,1]')

    # Work on a private copy so that neither a shared default nor the
    # caller's list can be affected further down.
    enforced = [] if enforced_structures is None else list(enforced_structures)

    rs = np.random.RandomState(seed)
    _prepare_training(parameters, structures, enforced,
                      rootdir, mode, n_splits, train_fraction, rs)
def _prepare_training(parameters: NamedTuple,
                      structures: List[Atoms],
                      enforced_structures: List[int],
                      rootdir: str,
                      mode: str,
                      n_splits: Optional[int],
                      train_fraction: Optional[float],
                      rs: np.random.RandomState) -> None:
    """Prepares training and test sets and writes structural data as well
    as parameters files.

    A directory ``nepmodel_full`` containing all structures is always
    written. If ``n_splits`` is not None, directories ``nepmodel_split1``
    to ``nepmodel_split<n_splits>`` are written in addition, each with its
    own train/test partition.

    See :func:`setup_training` for documentation of the parameters;
    ``rs`` is the random state that drives the splits.

    Raises
    ------
    ValueError
        if ``mode`` is unknown, or if in ``'bagging'`` mode
        ``train_fraction`` is None or too small to accommodate all
        enforced structures
    """
    n_structures = len(structures)

    def write_model_dir(subdir: str, train_selection: List[int],
                        test_selection: List[int]) -> None:
        # One model directory = structure files + nep.in.
        dirname = join_path(rootdir, subdir)
        makedirs(dirname, exist_ok=True)
        _write_structures(structures, dirname, train_selection, test_selection)
        write_nepfile(parameters, dirname)

    # Full model: every structure goes into the training set. The test
    # set is given the first structure only (presumably because the test
    # file must not be empty -- TODO confirm).
    write_model_dir('nepmodel_full', list(range(n_structures)), [0])

    if n_splits is None:
        return

    enforced = list(enforced_structures)
    # Sorted so that the seeded split does not depend on set iteration
    # order.
    remaining_structures = sorted(set(range(n_structures)) - set(enforced))

    if mode == 'kfold':
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=rs)
        for k, (train_positions, test_positions) in enumerate(
                kf.split(remaining_structures)):
            # KFold.split yields *positions* into the sequence it is
            # given, not the values stored there, so they have to be
            # mapped back to structure indices.
            train_selection = [remaining_structures[i] for i in train_positions]
            test_selection = [remaining_structures[i] for i in test_positions]

            # enforced structures always belong to the training set
            train_selection.extend(enforced)

            # sanity check: make sure there is no overlap between train and test
            assert set(train_selection).isdisjoint(test_selection), \
                'Train and test set should not overlap'

            write_model_dir(f'nepmodel_split{k+1}', train_selection, test_selection)

    elif mode == 'bagging':
        if train_fraction is None:
            raise ValueError(f'train_fraction must be set when mode {mode} is used')

        # Number of structures to draw at random on top of the enforced ones.
        n_random = int(train_fraction * n_structures) - len(enforced)
        if n_random < 0:
            raise ValueError(
                f'train_fraction ({train_fraction}) yields a training set of'
                f' {int(train_fraction * n_structures)} structures, which is smaller'
                f' than the number of enforced structures ({len(enforced)}).')

        for k in range(n_splits):
            train_selection = rs.choice(
                remaining_structures, size=n_random, replace=False).tolist()

            # enforced structures always belong to the training set
            train_selection.extend(enforced)

            # everything that is not used for training is used for testing
            test_selection = sorted(set(range(n_structures)) - set(train_selection))

            # sanity check: make sure there is no overlap between train and test
            assert set(train_selection).isdisjoint(test_selection), \
                'Train and test set should not overlap'

            write_model_dir(f'nepmodel_split{k+1}', train_selection, test_selection)

    else:
        raise ValueError(f'Unknown value for mode: {mode}.')


def _write_structures(structures: List[Atoms],
                      dirname: str,
                      train_selection: List[int],
                      test_selection: List[int]):
    """Writes structures in format readable by nep executable.

    The selected structures are written to ``train.xyz`` and ``test.xyz``
    inside ``dirname``. Within each file the structures appear in the
    order in which they occur in ``structures``, irrespective of the
    order of the indices in the selection.

    Parameters
    ----------
    structures
        list of all available structures
    dirname
        directory in which the two files are written
    train_selection
        indices into ``structures`` that make up the training set
    test_selection
        indices into ``structures`` that make up the test set
    """
    # Sets give constant-time membership tests in the filters below.
    train_indices = set(train_selection)
    test_indices = set(test_selection)

    write_structures(
        join_path(dirname, 'train.xyz'),
        [s for k, s in enumerate(structures) if k in train_indices])
    write_structures(
        join_path(dirname, 'test.xyz'),
        [s for k, s in enumerate(structures) if k in test_indices])