Source code for capalyzer.packet_parser.diversity_metrics
import math
import pandas as pd
from scipy.stats import gmean, entropy
from numpy.linalg import norm
from random import random, sample
import numpy as np
MIL = 1000 * 1000
# ALPHA Diversity`
[docs]def shannon_entropy(row, rarefy=0):
"""Return the shannon entropy of an iterable.
Shannon entropy is robust to rarefaction but we keep
the param for consistency.
"""
row_sum, H = sum(row), 0
for val in row:
val = val / row_sum
if val == 0:
continue
H += val * math.log2(val)
if H < 0:
H *= -1
return H
[docs]def richness(row, rarefy=0, count=False):
"""Return the richness of an iterable."""
if count:
return sum(row > 0)
row_sum, R = sum(row), 0
for val in row:
prob_success = val / row_sum
prob_fail = 1 - prob_success
prob_detect = 1 - (prob_fail ** rarefy)
if val and rarefy <= 0:
R += 1
else:
R += prob_detect
return int(R + 0.5)
[docs]def chao1(row, rarefy=0):
"""Return richnes of an iterable"""
row_sum, R, S, D = sum(row), 0, 0, 0.0000001
num_reads = MIL if math.isclose(row_sum, 1) else row_sum # default to 1M reads if compositional
num_reads = rarefy if rarefy > 0 else num_reads # if rarefy is set use that as read count
for val in row:
prob_success = val / row_sum
prob_fail = 1 - prob_success
prob_detect = 1 - (prob_fail ** num_reads)
if rarefy:
R += prob_detect
elif val:
R += 1
S += 1 if val == 1 else 0
D += 1 if val == 2 else 0
return R + (S ** 2) / (2 * D)
# Beta Diversity
[docs]def clr(X):
_X = X + 0.0000001
_X = _X / norm(_X, ord=1)
g = gmean(_X)
_X = np.divide(_X, g)
_X = np.log(_X)
return _X
[docs]def rho_proportionality(P, Q):
_P, _Q = clr(P), clr(Q)
N = np.var(_P - _Q)
D = np.var(_P) + np.var(_Q)
return 1 - (N / D)
[docs]def jensen_shannon_dist(P, Q):
_P = P / norm(P, ord=1)
_Q = Q / norm(Q, ord=1)
_M = 0.5 * (_P + _Q)
J = 0.5 * (entropy(_P, _M) + entropy(_Q, _M))
return math.sqrt(J)
# Rarefaction
[docs]def single_rarefaction(tbl, n=0):
"""Return the number of nonzero columns in tbl.
Select n rows at random if specified.
"""
if n and n > 0 and n < tbl.shape[0]:
tbl = tbl.loc[sample(list(tbl.index), n)]
return sum(tbl.sum(axis=0) > 0)
[docs]def rarefaction_analysis(tbl, ns=[], nsample=16, include_all=True):
"""Return a dataframe with two columns.
N, the number of samples and Taxa, the number of nonzero elements.
"""
result = []
if not ns:
ns = range(tbl.shape[0])
if include_all:
ns = list(ns) + [tbl.shape[0]]
for n in ns:
for _ in range(nsample):
result.append((n, single_rarefaction(tbl, n=n)))
return pd.DataFrame(result, columns=['N', 'Taxa'])