import pandas as pd
from os.path import join
from scipy.spatial.distance import pdist, squareform
from .diversity_metrics import (
shannon_entropy,
richness,
chao1,
jensen_shannon_dist,
rho_proportionality,
rarefaction_analysis,
)
from .normalize import subsample, proportions
from .longform_parser import parse_longform_taxa
from ..constants import (
CARD_RPKM,
CARD_RPKMG,
MEGARES_CLASS_RPKM,
MEGARES_CLASS_RPKMG,
MEGARES_GENE_RPKM,
MEGARES_GENE_RPKMG,
MEGARES_GROUP_RPKM,
MEGARES_GROUP_RPKMG,
MEGARES_MECH_RPKM,
MEGARES_MECH_RPKMG,
AVE_GENOME_SIZE,
HMP_COMPARISON,
MACROBES,
READ_PROPORTIONS,
READ_STATS,
UNIREF90_COV,
UNIREF90_RELAB,
MPA_RELAB,
KRAKENHLL_REFSEQ,
KRAKENHLL_REFSEQ_MEDIUM,
KRAKENHLL_REFSEQ_STRICT,
KRAKENHLL_REFSEQ_LONG,
)
class DataTableFactory:
    """Build pandas tables from the CSV files stored in a data packet directory.

    Every loader reads one file from ``packet_dir`` and (for most loaders)
    passes the result through :meth:`filter_dataframe`, which can restrict the
    rows to the samples listed in ``self.metadata`` and normalise the values.
    """

    def __init__(self, packet_dir, metadata_tbl=None):
        """Create a factory rooted at ``packet_dir``.

        Args:
            packet_dir: Directory holding the packet's CSV files.
            metadata_tbl: Optional metadata table; when given it is registered
                through :meth:`set_metadata`.
        """
        self.packet_dir = packet_dir
        self.metadata = None
        if metadata_tbl is not None:
            self.set_metadata(metadata_tbl)

    def set_metadata(self, metadata_tbl):
        """Register ``metadata_tbl`` as the metadata table used for filtering.

        NOTE(review): ``__init__`` called this method but no definition was
        present in the class as received, so constructing a factory with
        metadata raised ``AttributeError``. This minimal version only stores
        the table; confirm no extra processing was expected.
        """
        self.metadata = metadata_tbl

    def copy(self, new_metadata=None):
        """Return a new factory for the same packet directory.

        The new factory uses ``new_metadata`` if specified, otherwise it is
        given this factory's current metadata table (the table object itself
        is passed along, not duplicated here).
        """
        if new_metadata is None:
            new_metadata = self.metadata
        return type(self)(self.packet_dir, metadata_tbl=new_metadata)

    def csv_in_dir(self, fname, **kwargs):
        """Read ``fname`` from the packet directory and filter it.

        The first row is used as the header and the first column as the
        index. ``kwargs`` are forwarded to :meth:`filter_dataframe`.
        """
        tbl = pd.read_csv(
            join(self.packet_dir, fname),
            header=0,
            index_col=0,
        )
        return self.filter_dataframe(tbl, **kwargs)

    def filter_dataframe(self, tbl, **kwargs):
        """Apply the standard clean-up steps to ``tbl`` and return the result.

        Recognised keyword options (all optional):
            metadata_filter: keep only rows whose index label also appears in
                the metadata index (default True; no-op without metadata).
            no_fill_na: skip NA filling (default False).
            fillna: value used to fill NAs (default 0).
            remove_zero_cols: drop columns whose sum is not > 0 (default True).
            remove_zero_rows: drop rows whose sum is not > 0 (default True).
            normalize: falsy for no normalisation, ``'subsample'`` to call
                ``subsample`` (with ``n`` and ``niter``), any other truthy
                value to call ``proportions``.
        """
        if self.metadata is not None and kwargs.get('metadata_filter', True):
            # A boolean mask keeps the table's own row order and avoids
            # indexing .loc with a set, which recent pandas versions reject.
            tbl = tbl.loc[tbl.index.isin(self.metadata.index)]
        if not kwargs.get('no_fill_na', False):
            tbl = tbl.fillna(kwargs.get('fillna', 0))
        if kwargs.get('remove_zero_cols', True):
            # Select columns directly rather than transposing twice, which
            # could change column dtypes.
            tbl = tbl.loc[:, tbl.sum(axis=0) > 0]
        if kwargs.get('remove_zero_rows', True):
            tbl = tbl.loc[tbl.sum(axis=1) > 0]
        normalize = kwargs.get('normalize', False)
        if normalize:
            if normalize == 'subsample':
                n, niter = kwargs.get('n', -1), kwargs.get('niter', 1)
                tbl = subsample(tbl, n=n, niter=niter)
            else:
                tbl = proportions(tbl)
        return tbl

    def taxonomy(self, **kwargs):
        """Return a taxonomy table.

        Keyword options:
            tool: ``'krakenhll'`` (default), ``'krakenuniq'`` or
                ``'metaphlan2'`` (case-insensitive).
            strict: ``'permissive'`` (default), ``'medium'``, ``'strict'``
                or an int threshold.
            rank: taxonomic rank, default ``'species'``.
            exclude_ranks, min_cov, max_read_slope, min_reads: forwarded to
                ``parse_longform_taxa`` when the long-form table is used.

        For species rank with a named strictness a pre-built CSV is loaded
        through :meth:`csv_in_dir`; otherwise the long-form file is parsed.

        Raises:
            ValueError: if ``tool`` is not recognised.
            KeyError: if a named ``strict`` level is unknown on the long-form
                path.
        """
        tool = kwargs.get('tool', 'krakenhll').lower()
        if tool in ['metaphlan2']:
            return self.csv_in_dir(MPA_RELAB, **kwargs)
        if tool not in ['krakenhll', 'krakenuniq']:
            raise ValueError('Unknown taxonomy tool: {!r}'.format(tool))

        strict = kwargs.get('strict', 'permissive')
        rank = kwargs.get('rank', 'species')
        if isinstance(strict, str):
            # Normalise once so both branches treat case the same way.
            strict = strict.lower()

        if rank == 'species' and not isinstance(strict, int):
            fname = KRAKENHLL_REFSEQ
            if strict == 'strict':
                fname = KRAKENHLL_REFSEQ_STRICT
            elif strict == 'medium':
                fname = KRAKENHLL_REFSEQ_MEDIUM
            return self.csv_in_dir(fname, **kwargs)

        if isinstance(strict, str):
            strict = {'strict': 512, 'medium': 256, 'permissive': 4}[strict]
        exclude = kwargs.get('exclude_ranks', ['assembly', 'sequence'])
        min_cov = kwargs.get('min_cov', 0)
        max_read_slope = kwargs.get('max_read_slope', 0)
        min_reads = kwargs.get('min_reads', 0)
        return parse_longform_taxa(
            join(self.packet_dir, KRAKENHLL_REFSEQ_LONG),
            strict=strict, rank=rank, exclude_ranks=exclude,
            min_cov=min_cov, max_read_slope=max_read_slope,
            min_reads=min_reads,
        )

    def core_taxa(self, core_thresh=0.9, pan_thresh=0.2, zero_thresh=0, **kwargs):
        """Return a pandas Series grouping taxa into core, pan, and peripheral.

        Prevalence of a taxon is the fraction of rows of the taxonomy table
        whose value exceeds ``zero_thresh``. A taxon is ``'core'`` when its
        prevalence is above ``core_thresh``, ``'pan'`` when above
        ``pan_thresh``, and ``'peripheral'`` otherwise. ``kwargs`` are
        forwarded to :meth:`taxonomy`.
        """
        taxa = self.taxonomy(**kwargs)
        prev = (taxa > zero_thresh).sum(axis=0) / taxa.shape[0]

        def pan(val):
            if val > core_thresh:
                return 'core'
            elif val > pan_thresh:
                return 'pan'
            return 'peripheral'

        return prev.apply(pan)

    def amrs(self, **kwargs):
        """Return an AMR table.

        Keyword options:
            tool: ``'megares'`` (default) or ``'card'``.
            unit: ``'rpkm'`` (default) or ``'rpkmg'``.
            kind: MegaRes only; ``'class'`` (default), ``'gene'``,
                ``'group'`` or ``'mech'``.

        Raises:
            ValueError: if ``tool`` is not recognised.
            KeyError: if the ``unit``/``kind`` combination is unknown.
        """
        tool = kwargs.get('tool', 'megares').lower()
        unit = kwargs.get('unit', 'rpkm').lower()
        if tool in ['megares']:
            kind = kwargs.get('kind', 'class').lower()
            return self.csv_in_dir({
                ('rpkm', 'class'): MEGARES_CLASS_RPKM,
                ('rpkm', 'gene'): MEGARES_GENE_RPKM,
                ('rpkm', 'group'): MEGARES_GROUP_RPKM,
                ('rpkm', 'mech'): MEGARES_MECH_RPKM,
                ('rpkmg', 'class'): MEGARES_CLASS_RPKMG,
                ('rpkmg', 'gene'): MEGARES_GENE_RPKMG,
                ('rpkmg', 'group'): MEGARES_GROUP_RPKMG,
                ('rpkmg', 'mech'): MEGARES_MECH_RPKMG,
            }[(unit, kind)], **kwargs)
        elif tool in ['card']:
            return self.csv_in_dir({
                'rpkm': CARD_RPKM,
                'rpkmg': CARD_RPKMG,
            }[unit], **kwargs)
        raise ValueError('Unknown AMR tool: {!r}'.format(tool))

    def pathways(self, coverage_min=0, **kwargs):
        """Return a table of pathways.

        Args:
            coverage_min: when > 0, entries whose value in the coverage table
                is below this threshold are masked (replaced by the ``other``
                keyword option, default None) before the final filtering.
            **kwargs: ``kind`` selects ``'relab'`` (default) or ``'cov'``;
                everything is also forwarded to :meth:`filter_dataframe`.

        Raises:
            KeyError: if ``kind`` is not ``'relab'`` or ``'cov'``.
        """
        fname = {
            'relab': UNIREF90_RELAB,
            'cov': UNIREF90_COV,
        }[kwargs.get('kind', 'relab').lower()]
        if coverage_min <= 0:
            return self.csv_in_dir(fname, **kwargs)

        # Load both tables without normalisation so the threshold is compared
        # against the coverage values as stored, and so the abundance table is
        # normalised exactly once (after masking) rather than twice.
        raw_opts = dict(kwargs, normalize=False)
        tbl = self.csv_in_dir(fname, **raw_opts)
        cov_tbl = self.csv_in_dir(UNIREF90_COV, **raw_opts)
        tbl = tbl.mask(cov_tbl < coverage_min, other=kwargs.get('other', None))
        return self.filter_dataframe(tbl, **kwargs)

    def ags(self, **kwargs):
        """Return the Ave Genome Size estimates table."""
        return self.csv_in_dir(AVE_GENOME_SIZE, **kwargs)

    def hmp(self, **kwargs):
        """Return a table of HMP distances.

        Metadata filtering and zero row/column removal are always disabled
        when reading the file; instead, when metadata is set, rows are kept
        if their ``sample_name`` column value is in the metadata index.
        """
        # Merge instead of passing both literal keywords and **kwargs, which
        # raised TypeError whenever a caller supplied one of these keys.
        opts = dict(kwargs)
        opts.update(
            metadata_filter=False,
            remove_zero_rows=False,
            remove_zero_cols=False,
        )
        tbl = self.csv_in_dir(HMP_COMPARISON, **opts)
        if self.metadata is not None:
            tbl = tbl.loc[tbl['sample_name'].isin(self.metadata.index)]
        return tbl

    def macrobes(self, **kwargs):
        """Return a table of macrobe abundances."""
        return self.csv_in_dir(MACROBES, **kwargs)

    def read_props(self, **kwargs):
        """Return a table of the proportion of reads assigned to macro categories."""
        return self.csv_in_dir(READ_PROPORTIONS, **kwargs)

    def read_stats(self, **kwargs):
        """Return a table of the gc content and reads counts."""
        return self.csv_in_dir(READ_STATS, **kwargs)

    def taxa_alpha_diversity(self, **kwargs):
        """Return per-row alpha diversities of the taxonomy table."""
        taxa = self.taxonomy(**kwargs)
        return self.alpha_diversity(taxa, **kwargs)

    def amr_alpha_diversity(self, **kwargs):
        """Return per-row alpha diversities of the AMR table."""
        amr = self.amrs(**kwargs)
        return self.alpha_diversity(amr, **kwargs)

    def alpha_diversity(self, tbl, **kwargs):
        """Return generic alpha diversity, computed row by row on ``tbl``.

        Keyword options:
            metric: ``'shannon_entropy'`` (default), ``'richness'`` or
                ``'chao1'``.
            rarefy: passed to the metric function (default 0).
            count: ``'richness'`` only, passed through (default False).

        Raises:
            ValueError: if ``metric`` is not recognised.
        """
        metric = kwargs.get('metric', 'shannon_entropy').lower()
        rarefy = kwargs.get('rarefy', 0)
        if metric == 'shannon_entropy':
            return tbl.apply(shannon_entropy, rarefy=rarefy, axis=1)
        elif metric == 'richness':
            return tbl.apply(
                richness, rarefy=rarefy, count=kwargs.get('count', False), axis=1
            )
        elif metric == 'chao1':
            return tbl.apply(chao1, rarefy=rarefy, axis=1)
        raise ValueError('Unknown alpha diversity metric: {!r}'.format(metric))

    def taxa_beta_diversity(self, **kwargs):
        """Return a distance matrix between rows of the taxonomy table."""
        taxa = self.taxonomy(**kwargs)
        return self.beta_diversity(taxa, **kwargs)

    def amr_beta_diversity(self, **kwargs):
        """Return a distance matrix between rows of the AMR table."""
        amr = self.amrs(**kwargs)
        return self.beta_diversity(amr, **kwargs)

    def beta_diversity(self, tbl, **kwargs):
        """Return generic beta diversity: pairwise distances between rows.

        The ``metric`` keyword option may be ``'jsd'`` (default), ``'rho'``,
        any other metric name understood by ``scipy.spatial.distance.pdist``,
        or a callable taking two 1-D arrays. The result is a square DataFrame
        indexed and labelled by ``tbl.index``.
        """
        metric = kwargs.get('metric', 'jsd')
        if isinstance(metric, str):
            metric = metric.lower()
            if metric == 'jsd':
                metric = jensen_shannon_dist
            elif metric == 'rho':
                metric = rho_proportionality
        distm = squareform(pdist(tbl, metric))
        return pd.DataFrame(distm, index=tbl.index, columns=tbl.index)

    def taxa_rarefaction(self, **kwargs):
        """Return a rarefaction analysis of taxa."""
        taxa = self.taxonomy(**kwargs)
        return self.rarefaction(taxa, **kwargs)

    def amr_rarefaction(self, **kwargs):
        """Return a rarefaction analysis of AMRs."""
        amr = self.amrs(**kwargs)
        return self.rarefaction(amr, **kwargs)

    def rarefaction(self, tbl, **kwargs):
        """Return ``rarefaction_analysis`` of ``tbl``.

        Keyword options ``ns`` (default ``[]``), ``nsample`` (default 16) and
        ``include_all`` (default True) are passed through unchanged.
        """
        ns = kwargs.get('ns', [])
        nsample = kwargs.get('nsample', 16)
        include_all = kwargs.get('include_all', True)
        return rarefaction_analysis(
            tbl, ns=ns, nsample=nsample, include_all=include_all
        )