Source code for blacksheep.classes

from typing import List, Optional, Iterable
import logging
import pandas as pd
from pandas import DataFrame
import numpy as np
from blacksheep._constants import col_seps, col_outlier_suffix, col_not_outlier_suffix, \
    gene_list_file_name


[docs]def list_to_file(lis: Iterable, filename: str): """Takes an iterable and a file path and writes a value per line from the iterable into the new file. Args: lis: Iterable to write to file filename: Filename to write to. Returns: None """ with open(filename, "w") as fh: for x in lis: fh.write("%s\n" % x)
[docs]def make_frac_table(df, samples): """Constructs the fraction table from the outliers table Returns: A DataFrame with one column per sample, with the fraction of outliers per row per sample. This table is useful for visualization but not statistics. """ df = df.copy() cols_outliers = [x + col_seps + col_outlier_suffix for x in samples] cols_notOutliers = [x + col_seps + col_not_outlier_suffix for x in samples] df = df.fillna(0) num_total_psites = df[cols_notOutliers].values + df[cols_outliers].values with np.errstate(divide="ignore", invalid="ignore"): frac_table = df[cols_outliers].values / num_total_psites return DataFrame(frac_table, index=df.index, columns=samples)
[docs]class OutlierTable: """Output of calling outliers. """ def __init__( self, df: DataFrame, updown: str, iqrs: Optional[float], samples: Optional[list], frac_table: Optional[DataFrame], ): """Instantiate an OutlierTable Args: df: DataFrame with outlier and non-outlier columns, and genes/sites as rows. updown: Whether the outliers are above or below the median. Options are "up" or "down" iqrs: The IQR threshold used to call outliers. samples: The samples included in the analysis to define median and IQR. frac_table: DataFrame with samples as columns and genes/sites as rows indicating what fraction of sites per sample were called as outliers. Useful for visualization. """ self.df = df self.up_or_down = updown self.iqrs = iqrs self.samples = samples if frac_table is not None: self.frac_table = frac_table else: self.frac_table = make_frac_table(df, samples)
[docs]class qValues: """Output from comparing groups using outliers. """ def __init__(self, df: DataFrame, comps: list, frac_filter: Optional[float]): """Instantiates a qValues object. Args: df: DataFrame with genes/sites as rows and comparison_group as columns. comps: List of comparisons used to populate table. frac_filter: What fraction of samples in group of interest were required to have an outliers for any given row to be considered for analysis. """ self.df = df self.comps = comps self.frac_filter = frac_filter
[docs] def write_gene_lists( self, fdr_cut_off: float = 0.01, output_prefix: str = "outliers", comparisons: Optional[List] = None, ): """ Writes significant gene list files for every column in a qvalue table Args: fdr_cut_off: FDR threshold for significance output_prefix: Output prefix for files comparisons: which subset of qvalue columns to write gene lists for. Default will write for all columns Returns: None """ if comparisons is None: comparisons = self.df.columns else: comparisons = [ col for col in self.df.columns if (col.startswith(tuple(comparisons))) and (col in self.comps) ] for comp in comparisons: sig_genes = list(self.df.loc[(self.df[comp] < fdr_cut_off), :].index) list_to_file( sig_genes, gene_list_file_name % (output_prefix, comp, fdr_cut_off) )
[docs] def make_signed_logqs(self) -> DataFrame: """Create a DataFrame with signed log10 qvalues for each comparison. E.g. group1 qvalues will be positive, and group 2 qvalues will be negative. Assignment of positive group is based on order in qvalues, could be helpful to negate some columns in output depending on group of interest. Returns: DataFrame with signed qvalues. """ if not (self.comps is None): self.comps = [i.split('_', 1)[1].rsplit('_', 1)[0] for i in self.df.columns] self.comps = sorted(list(set(self.comps))) signed_qs = pd.DataFrame() for comp in self.comps: cols = [ col for col in self.df.columns if col.split('_', 1)[1].rsplit('_', 1)[0] == comp ] if len(cols) > 2 or len(cols) == 0: logging.warning("Excluding %s, %s columns are associated with %s, need 1 or 2 " "columns. Annotation value probably has an _ in it. " % (comp, len(cols), comp)) continue if len(cols) == 1: temp = self.df[cols[0]] elif len(cols) == 2: temp = pd.DataFrame( -np.log10(self.df[cols[1]]).subtract(np.log10(self.df[cols[0]]), fill_value=0), columns=['%s_%s' % (comp, cols[1].rsplit('_', 1)[1])] ) signed_qs = pd.concat([signed_qs, temp], join='outer', axis=1, sort=False) return signed_qs