Source code for calorine.tools.analysis

from ase import units
from typing import Optional

import numpy as np
import pandas as pd
from scipy import stats



[docs]
def analyze_data(data: np.ndarray, max_lag: int = None) -> dict:
    """ Carries out an extensive analysis of the data series.

    Parameters
    ----------
    data
        data series to compute autocorrelation function for
    max_lag
        maximum lag between two data points, used for computing autocorrelation

    Returns
    -------
    dict
        calculated properties of the data including, mean, standard deviation,
        correlation length and a 95% error estimate.
    """
    summary = dict(mean=data.mean(),
                   std=data.std())
    acf = get_autocorrelation_function(data, max_lag)
    correlation_length = _estimate_correlation_length_from_acf(acf)
    if correlation_length is not None:
        error_estimate = _estimate_error(data, correlation_length, confidence=0.95)
        summary['correlation_length'] = correlation_length
        summary['error_estimate'] = error_estimate
    else:
        summary['correlation_length'] = np.nan
        summary['error_estimate'] = np.nan
    return summary




[docs]
def get_autocorrelation_function(data: np.ndarray, max_lag: int = None) -> np.ndarray:
    """ Returns autocorrelation function.

    The autocorrelation function is computed using `pandas.Series.autocorr
    <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.autocorr.html>`_.

    Parameters
    ----------
    data
        data series to compute autocorrelation function for
    max_lag
        maximum lag between two data points

    Returns
    -------
        calculated autocorrelation function
    """
    if max_lag is None:
        max_lag = len(data) - 1
    if max_lag < 1 or max_lag >= len(data):
        raise ValueError('max_lag should be between 1 and len(data)-1.')
    series = pd.Series(data)
    acf = [series.autocorr(lag) for lag in range(0, max_lag)]
    return np.array(acf)




[docs]
def get_correlation_length(data: np.ndarray) -> Optional[int]:
    """ Returns estimate of the correlation length of data.

    The correlation length is taken as the first point where the
    autocorrelation functions is less than :math:`\\exp(-2)`. If the
    correlation function never drops below :math:`\\exp(-2)` ``np.nan`` is
    returned.

    If the correlation length cannot be computed since the auto-correlation
    function is unconverged the function returns `None`.

    Parameters
    ----------
    data
        data series for which to the compute autocorrelation function

    Returns
    -------
        correlation length
    """

    acf = get_autocorrelation_function(data)
    correlation_length = _estimate_correlation_length_from_acf(acf)
    if correlation_length is None:
        return None
    return correlation_length




[docs]
def get_error_estimate(data: np.ndarray, confidence: float = 0.95) -> Optional[float]:
    """Returns estimate of standard error :math:`\\mathrm{error}`
    with confidence interval.

    .. math::

       \\mathrm{error} = t_\\mathrm{factor} * \\mathrm{std}(\\mathrm{data}) / \\sqrt{N_s}

    where :math:`t_{factor}` is the factor corresponding to the confidence
    interval and :math:`N_s` is the number of independent measurements
    (with correlation taken into account).

    If the correlation length cannot be computed since the auto-correlation
    function is unconverged the function returns `None`.

    Parameters
    ----------
    data
        data series for which to estimate the error

    Returns
    -------
        error estimate

    """
    correlation_length = get_correlation_length(data)
    if correlation_length is None:
        return None
    error_estimate = _estimate_error(data, correlation_length, confidence)
    return error_estimate



def _estimate_correlation_length_from_acf(acf: np.ndarray) -> Optional[int]:
    """Estimates correlation length from acf. Returns None if the auto-correlation
    function is uncoverged."""
    for i, a in enumerate(acf):
        if a < np.exp(-2):
            return i
    return None  # np.nan


def _estimate_error(data: np.ndarray,
                    correlation_length: int,
                    confidence: float) -> float:
    """ Estimates error using correlation length. """
    t_factor = stats.t.ppf((1 + confidence) / 2, len(data)-1)  # type: float
    error = t_factor * np.std(data) / np.sqrt(len(data) / correlation_length)  # type: float
    return error



[docs]
def get_rtc_from_hac(hac: np.ndarray, V: float, T: float, dt: float) -> np.ndarray:
    """Returns the running thermal conductivity (RTC) in W/m/K using a
    heat auto-correlation (HAC) as input.

    Parameters
    ----------
    hac
        The HAC :math:`\\langle j(t)j(0)\\rangle` in units of eV\\ :sup:`3`/amu as given by GPUMD
    V
        Volume of cell in Å\\ :sup:`3`
    T
        Temperature in Kelvin
    dt
        Time step in the HAC in ps
       running thermal conductivity in W/m/K
    """

    hac = np.asarray(hac)

    if hac.ndim > 1:
        raise ValueError('hac must be 1D')

    kB = units.kB  # eV/K

    J_per_eV = 1 / units.J
    kg_per_u = 1 / units.kg
    s_per_ps = 1e-12
    m_per_Å = 1 / units.m

    # The integration and normalization
    rtc = np.cumsum(hac) * dt / (kB * T**2 * V)

    # From eV/ps/Å/K to J/s/m/K
    rtc *= (J_per_eV**3 / kg_per_u) * s_per_ps / (J_per_eV * m_per_Å**3)

    return rtc