# Source code for sctools.stats

```"""
Statistics Functions for Sequence Data Analysis
===============================================

.. currentmodule:: sctools

This module implements statistical modules for sequence analysis

Methods
-------
base4_entropy(x: np.array, axis: int=1)
calculate the entropy of a 4 x sequence length base frequency matrix

Classes
-------
OnlineGaussianSuficientStatistic        Empirical (online) calculation of mean and variance

"""

from typing import Tuple
import numpy as np

[docs]def base4_entropy(x, axis=1):
"""Calculate entropy in base four of a data matrix x

Useful for measuring DNA entropy (with 4 nucleotides) as the output is restricted to [0, 1]

Parameters
----------
x : np.ndarray
array of dimension one or more containing numeric types
axis : int, optional
axis to calculate entropy across. Values in this axis are treated as observation frequencies

Returns
-------
entropy : np.ndarray
array of input dimension - 1 containin entropy values bounded in [0, 1]

"""

# convert to probabilities
if axis == 1:
x = np.divide(x, np.sum(x, axis=axis)[:, None])
else:
x = np.divide(x, np.sum(x, axis=axis))

with np.errstate(divide="ignore"):
r = np.log(x) / np.log(4)

# convention: 0 * log(0) = 0, != -INF.
r[np.isinf(r)] = 0

return np.abs(-1 * np.sum(x * r, axis=axis))

[docs]class OnlineGaussianSufficientStatistic:
"""
Implementation of Welford's online mean and variance algorithm

Methods
-------
update(new_value: float)
incorporate new_value into the online estimate of mean and variance
mean()
return the mean value
calculate_variance()
calculate and return the variance
mean_and_variance()
return both mean and variance

"""

__slots__ = ["_count", "_mean", "_mean_squared_error"]

def __init__(self):
self._mean_squared_error: float = 0.0
self._mean: float = 0.0
self._count: int = 0

[docs]    def update(self, new_value: float) -> None:
self._count += 1
delta = new_value - self._mean
self._mean += delta / self._count
delta2 = new_value - self._mean
self._mean_squared_error += delta * delta2

@property
def mean(self) -> float:
"""return the mean value"""
return self._mean

[docs]    def calculate_variance(self):
"""calculate and return the variance"""
if self._count < 2:
return float("nan")
else:
return self._mean_squared_error / (self._count - 1)

[docs]    def mean_and_variance(self) -> Tuple[float, float]:
"""calculate and return the mean and variance"""
return self.mean, self.calculate_variance()
```