Source code for calorine.nep.training_factory

from os import makedirs
from os.path import exists, join as join_path
from typing import List, NamedTuple, Optional

import numpy as np
from ase import Atoms
from sklearn.model_selection import KFold

from .io import write_nepfile, write_structures



[docs]
def setup_training(parameters: NamedTuple,
                   structures: List[Atoms],
                   enforced_structures: Optional[List[int]] = None,
                   rootdir: str = '.',
                   mode: str = 'kfold',
                   n_splits: Optional[int] = None,
                   train_fraction: Optional[float] = None,
                   seed: int = 42,
                   overwrite: bool = False,
                   ) -> None:
    """Sets up the input files for training a NEP via the ``nep``
    executable of the GPUMD package.

    All arguments are validated before anything is written to disk.

    Parameters
    ----------
    parameters
        parameters to be set in the nep.in file;
        see `here <https://gpumd.org/nep/input_parameters/index.html>`__
        for an overview of these parameters
    structures
        list of structures to be included
    enforced_structures
        structures that _must_ be included in the training set, provided in the form
        of a list of indices that refer to the content of the ``structures`` parameter;
        by default no structures are enforced
    rootdir
        root directory in which to create the input files
    mode
        how the test-train split is performed. Options: ``'kfold'`` and ``'bagging'``
    n_splits
        number of splits of the input structures in training and test sets that ought to be
        performed; by default no split will be done and all input structures will be used
        for training
    train_fraction
        fraction of structures to use for training when mode ``'bagging'`` is used;
        must be set (and lie in the interval (0,1]) for mode ``'bagging'`` and must
        not be set for mode ``'kfold'``
    seed
        random number generator seed to be used; this ensures reproducibility
    overwrite
        if True overwrite the content of ``rootdir`` if it exists

    Raises
    ------
    FileExistsError
        if ``rootdir`` exists and ``overwrite`` is False
    ValueError
        if ``n_splits`` is not in the range from 1 to the number of structures,
        if ``train_fraction`` is inconsistent with ``mode``, or
        if ``mode`` is unknown while ``n_splits`` is set
    """
    if exists(rootdir) and not overwrite:
        raise FileExistsError('Output directory exists.'
                              ' Set overwrite=True in order to override this behavior.')

    if n_splits is not None and (n_splits <= 0 or n_splits > len(structures)):
        raise ValueError(f'n_splits ({n_splits}) must be positive and'
                         f' must not exceed {len(structures)}.')

    if mode == 'kfold' and train_fraction is not None:
        raise ValueError(f'train_fraction cannot be set when mode {mode} is used')
    elif mode == 'bagging' and (train_fraction is None
                                or train_fraction <= 0 or train_fraction > 1):
        # the explicit None check avoids a TypeError from comparing None with a number
        raise ValueError(f'train_fraction ({train_fraction}) must be in (0,1]')
    elif mode not in ('kfold', 'bagging') and n_splits is not None:
        # reject unknown modes up front rather than after files have been written
        raise ValueError(f'Unknown value for mode: {mode}.')

    # use a fresh list per call instead of a shared mutable default
    enforced = [] if enforced_structures is None else list(enforced_structures)

    rs = np.random.RandomState(seed)
    _prepare_training(parameters, structures, enforced,
                      rootdir, mode, n_splits, train_fraction, rs)



def _prepare_training(parameters: NamedTuple,
                      structures: List[Atoms],
                      enforced_structures: List[int],
                      rootdir: str,
                      mode: str,
                      n_splits: Optional[int],
                      train_fraction: Optional[float],
                      rs: np.random.RandomState) -> None:
    """Prepares training and test sets and writes structural data as well as parameters files.

    A directory ``nepmodel_full`` containing all structures is always written.
    If ``n_splits`` is set, one directory ``nepmodel_split<k>`` is written per split
    (``k`` starting at 1). In both modes the enforced structures are part of the
    training set of every split.

    See the docstring of :func:`setup_training` for documentation of parameters;
    ``rs`` is the random state used for shuffling/sampling.

    Raises
    ------
    ValueError
        if ``mode`` is unknown, or if in mode ``'bagging'`` the number of enforced
        structures exceeds the requested size of the training set
    """
    n_structures = len(structures)

    # full model: every structure goes into the training set
    # (the test set consists of the structure with index 0)
    _write_split_directory(parameters, structures, join_path(rootdir, 'nepmodel_full'),
                           list(range(n_structures)), [0])

    if n_splits is None:
        return

    remaining_structures = list(set(range(n_structures)) - set(enforced_structures))

    if mode == 'kfold':
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=rs)
        for k, (train_indices, test_indices) in enumerate(kf.split(remaining_structures)):
            # KFold returns positions in remaining_structures; map back to structure indices
            train_selection = [remaining_structures[x] for x in train_indices]
            test_selection = [remaining_structures[x] for x in test_indices]

            # append enforced structures at the end of the training set
            train_selection.extend(enforced_structures)

            # sanity check: make sure there is no overlap between train and test
            assert set(train_selection).isdisjoint(test_selection), \
                'Train and test set should not overlap'

            _write_split_directory(parameters, structures,
                                   join_path(rootdir, f'nepmodel_split{k+1}'),
                                   train_selection, test_selection)

    elif mode == 'bagging':
        # number of structures to draw at random, on top of the enforced ones
        n_train = int(train_fraction * n_structures)
        n_random = n_train - len(enforced_structures)
        if n_random < 0:
            raise ValueError(
                f'The number of enforced structures ({len(enforced_structures)}) exceeds'
                f' the size of the training set ({n_train}).')

        for k in range(n_splits):
            train_selection = list(rs.choice(remaining_structures, size=n_random, replace=False))

            # append enforced structures at the end of the training set
            train_selection.extend(enforced_structures)

            # add the remaining structures to the test set
            test_selection = list(set(range(n_structures)) - set(train_selection))

            # sanity check: make sure there is no overlap between train and test
            assert set(train_selection).isdisjoint(test_selection), \
                'Train and test set should not overlap'

            _write_split_directory(parameters, structures,
                                   join_path(rootdir, f'nepmodel_split{k+1}'),
                                   train_selection, test_selection)

    else:
        raise ValueError(f'Unknown value for mode: {mode}.')


def _write_split_directory(parameters: NamedTuple,
                           structures: List[Atoms],
                           dirname: str,
                           train_selection: List[int],
                           test_selection: List[int]) -> None:
    """Creates ``dirname`` and writes the structure files and the nep.in file into it."""
    makedirs(dirname, exist_ok=True)
    _write_structures(structures, dirname, train_selection, test_selection)
    write_nepfile(parameters, dirname)


def _write_structures(structures: List[Atoms],
                      dirname: str,
                      train_selection: List[int],
                      test_selection: List[int]):
    """Writes structures in format readable by nep executable.

    Structures are written in the order in which they appear in ``structures``,
    regardless of the order of the indices in the selections.

    Parameters
    ----------
    structures
        list of all available structures
    dirname
        directory in which ``train.xyz`` and ``test.xyz`` are written
    train_selection
        indices of the structures to be written to ``train.xyz``
    test_selection
        indices of the structures to be written to ``test.xyz``
    """
    # build the lookup sets once; membership tests against lists are O(n) each
    selections = (('train.xyz', set(train_selection)),
                  ('test.xyz', set(test_selection)))
    for filename, selected in selections:
        write_structures(
            join_path(dirname, filename),
            [s for k, s in enumerate(structures) if k in selected])