Source code for capalyzer.packet_parser.longform_parser

import pandas as pd
import gzip
import csv


def parse_longform_taxa(filename, rank='all', strict=512, min_reads=0,
                        min_cov=0, max_read_slope=0,
                        exclude_ranks=('assembly', 'sequence')):
    """Parse a gzipped long-form taxa CSV into a samples-by-taxa table.

    The first record of the file is treated as a header and skipped. Each
    remaining row is read positionally: column 0 is the sample name,
    column 1 the taxon name, column 3 the taxon rank, column 5 the read
    count (``nreads``), column 6 the k-mer count (``nkmers``) and column 8
    the coverage (``cov``). Blank rows are ignored.

    Args:
        filename: Path to the gzip-compressed, UTF-8 encoded CSV file.
        rank: Keep only rows whose rank equals this value. ``'all'`` (or
            any falsy value) disables the rank filter.
        strict: Minimum k-mer count a row must have; falsy disables.
        min_reads: Minimum read count a row must have; falsy disables.
        min_cov: Minimum coverage a row must have; falsy disables.
        max_read_slope: If truthy, rows whose reads-per-kmer ratio exceeds
            this value are dropped unless their coverage is at least 0.9.
            A row with zero k-mers is treated as having an infinite ratio.
        exclude_ranks: Container of ranks whose rows are always dropped.
            ``None`` means "exclude nothing".

    Returns:
        A ``pandas.DataFrame`` indexed by sample name with one column per
        taxon, holding the read count of each retained row; missing
        sample/taxon combinations are filled with 0. If a sample/taxon
        pair occurs more than once, the last retained row wins.

    Raises:
        ValueError: If the read or k-mer count of a row is not an integer.
        IndexError: If a non-blank row has fewer than nine columns.
    """
    if exclude_ranks is None:
        exclude_ranks = ()

    tbl = {}
    # Text mode with newline='' lets the csv module do its own newline
    # handling, so quoted fields that span lines are parsed correctly.
    with gzip.open(filename, 'rt', encoding='utf-8', newline='') as longform:
        reader = csv.reader(longform)
        next(reader, None)  # skip the header record
        for tkns in reader:
            if not tkns:
                # csv.reader yields [] for blank lines; nothing to parse.
                continue
            sample_name, taxa_name, taxa_rank = tkns[0], tkns[1], tkns[3]
            try:
                nkmers, nreads = int(tkns[6]), int(tkns[5])
            except ValueError as err:
                raise ValueError(
                    'Could not parse read/kmer counts in row: '
                    '{!r}'.format(tkns)
                ) from err
            try:
                cov = float(tkns[8])
            except ValueError:
                # Unparseable coverage is treated as zero coverage.
                cov = 0

            if rank and rank != 'all' and rank != taxa_rank:
                continue
            if strict and nkmers < strict:
                continue
            if min_cov and cov < min_cov:
                continue
            if min_reads and nreads < min_reads:
                continue
            if max_read_slope and cov < 0.9:
                # Only compute the ratio when the filter is active, and
                # guard the division so a zero k-mer row cannot abort the
                # whole parse with ZeroDivisionError.
                read_slope = nreads / nkmers if nkmers else float('inf')
                if read_slope > max_read_slope:
                    continue
            if taxa_rank in exclude_ranks:
                continue

            tbl.setdefault(sample_name, {})[taxa_name] = nreads
    return pd.DataFrame.from_dict(tbl, orient='index').fillna(0)