Coverage for calorine/tools/analysis.py: 100%

57 statements  

« prev     ^ index     » next       coverage.py v7.2.5, created at 2024-10-17 08:55 +0000

1from ase import units 

2from typing import Optional 

3 

4import numpy as np 

5import pandas as pd 

6from scipy import stats 

7 

8 

9def analyze_data(data: np.ndarray, max_lag: int = None) -> dict: 

10 """ Carries out an extensive analysis of the data series. 

11 

12 Parameters 

13 ---------- 

14 data 

15 data series to compute autocorrelation function for 

16 max_lag 

17 maximum lag between two data points, used for computing autocorrelation 

18 

19 Returns 

20 ------- 

21 dict 

22 calculated properties of the data including, mean, standard deviation, 

23 correlation length and a 95% error estimate. 

24 """ 

25 summary = dict(mean=data.mean(), 

26 std=data.std()) 

27 acf = get_autocorrelation_function(data, max_lag) 

28 correlation_length = _estimate_correlation_length_from_acf(acf) 

29 if correlation_length is not None: 

30 error_estimate = _estimate_error(data, correlation_length, confidence=0.95) 

31 summary['correlation_length'] = correlation_length 

32 summary['error_estimate'] = error_estimate 

33 else: 

34 summary['correlation_length'] = np.nan 

35 summary['error_estimate'] = np.nan 

36 return summary 

37 

38 

39def get_autocorrelation_function(data: np.ndarray, max_lag: int = None) -> np.ndarray: 

40 """ Returns autocorrelation function. 

41 

42 The autocorrelation function is computed using `pandas.Series.autocorr 

43 <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.autocorr.html>`_. 

44 

45 Parameters 

46 ---------- 

47 data 

48 data series to compute autocorrelation function for 

49 max_lag 

50 maximum lag between two data points 

51 

52 Returns 

53 ------- 

54 calculated autocorrelation function 

55 """ 

56 if max_lag is None: 

57 max_lag = len(data) - 1 

58 if max_lag < 1 or max_lag >= len(data): 

59 raise ValueError('max_lag should be between 1 and len(data)-1.') 

60 series = pd.Series(data) 

61 acf = [series.autocorr(lag) for lag in range(0, max_lag)] 

62 return np.array(acf) 

63 

64 

65def get_correlation_length(data: np.ndarray) -> Optional[int]: 

66 """ Returns estimate of the correlation length of data. 

67 

68 The correlation length is taken as the first point where the 

69 autocorrelation functions is less than :math:`\\exp(-2)`. If the 

70 correlation function never drops below :math:`\\exp(-2)` ``np.nan`` is 

71 returned. 

72 

73 If the correlation length cannot be computed since the auto-correlation 

74 function is unconverged the function returns `None`. 

75 

76 Parameters 

77 ---------- 

78 data 

79 data series for which to the compute autocorrelation function 

80 

81 Returns 

82 ------- 

83 correlation length 

84 """ 

85 

86 acf = get_autocorrelation_function(data) 

87 correlation_length = _estimate_correlation_length_from_acf(acf) 

88 if correlation_length is None: 

89 return None 

90 return correlation_length 

91 

92 

93def get_error_estimate(data: np.ndarray, confidence: float = 0.95) -> Optional[float]: 

94 """Returns estimate of standard error :math:`\\mathrm{error}` 

95 with confidence interval. 

96 

97 .. math:: 

98 

99 \\mathrm{error} = t_\\mathrm{factor} * \\mathrm{std}(\\mathrm{data}) / \\sqrt{N_s} 

100 

101 where :math:`t_{factor}` is the factor corresponding to the confidence 

102 interval and :math:`N_s` is the number of independent measurements 

103 (with correlation taken into account). 

104 

105 If the correlation length cannot be computed since the auto-correlation 

106 function is unconverged the function returns `None`. 

107 

108 Parameters 

109 ---------- 

110 data 

111 data series for which to estimate the error 

112 

113 Returns 

114 ------- 

115 error estimate 

116 

117 """ 

118 correlation_length = get_correlation_length(data) 

119 if correlation_length is None: 

120 return None 

121 error_estimate = _estimate_error(data, correlation_length, confidence) 

122 return error_estimate 

123 

124 

125def _estimate_correlation_length_from_acf(acf: np.ndarray) -> Optional[int]: 

126 """Estimates correlation length from acf. Returns None if the auto-correlation 

127 function is uncoverged.""" 

128 for i, a in enumerate(acf): 

129 if a < np.exp(-2): 

130 return i 

131 return None # np.nan 

132 

133 

134def _estimate_error(data: np.ndarray, 

135 correlation_length: int, 

136 confidence: float) -> float: 

137 """ Estimates error using correlation length. """ 

138 t_factor = stats.t.ppf((1 + confidence) / 2, len(data)-1) # type: float 

139 error = t_factor * np.std(data) / np.sqrt(len(data) / correlation_length) # type: float 

140 return error 

141 

142 

143def get_rtc_from_hac(hac: np.ndarray, V: float, T: float, dt: float) -> np.ndarray: 

144 """Returns the running thermal conductivity (RTC) in W/m/K using a 

145 heat auto-correlation (HAC) as input. 

146 

147 Parameters 

148 ---------- 

149 hac 

150 The HAC :math:`\\langle j(t)j(0)\\rangle` in units of eV\\ :sup:`3`/amu as given by GPUMD 

151 V 

152 Volume of cell in Å\\ :sup:`3` 

153 T 

154 Temperature in Kelvin 

155 dt 

156 Time step in the HAC in ps 

157 running thermal conductivity in W/m/K 

158 """ 

159 

160 hac = np.asarray(hac) 

161 

162 if hac.ndim > 1: 

163 raise ValueError('hac must be 1D') 

164 

165 kB = units.kB # eV/K 

166 

167 J_per_eV = 1 / units.J 

168 kg_per_u = 1 / units.kg 

169 s_per_ps = 1e-12 

170 m_per_Å = 1 / units.m 

171 

172 # The integration and normalization 

173 rtc = np.cumsum(hac) * dt / (kB * T**2 * V) 

174 

175 # From eV/ps/Å/K to J/s/m/K 

176 rtc *= (J_per_eV**3 / kg_per_u) * s_per_ps / (J_per_eV * m_per_Å**3) 

177 

178 return rtc