# Source code for blacksheep.parsers

import os.path
import pandas as pd
import numpy as np
from pandas import DataFrame
from typing import Iterable
from blacksheep.classes import OutlierTable
from blacksheep._constants import *


def _is_valid_file(arg: str) -> str:
    """Confirms that a path exists and hands it back unchanged.

    Mostly useful as an argparse ``type=`` callable, where returning the
    value is required. Any existing filesystem entry is accepted, since the
    check is ``os.path.exists``.

    Args:
        arg: File path

    Returns: arg
        The same path, once it is known to exist

    Raises:
        FileNotFoundError: If nothing exists at ``arg``.

    """
    if os.path.exists(arg):
        return arg
    raise FileNotFoundError("%s does not exist" % arg)


def _check_output_prefix(arg: str) -> str:
    """Checks that the directory part of an output prefix exists.

    A prefix with no directory component (e.g. ``"results"``) is accepted
    without any filesystem check. Otherwise the directory component is
    validated with ``_is_valid_file``.

    Args:
        arg: Output prefix

    Returns: arg
        The output prefix, unchanged

    Raises:
        FileNotFoundError: If the directory part of the prefix does not exist.

    """
    # os.path.dirname keeps the root for prefixes such as "/out" (a manual
    # rsplit("/") yields "" there, which would wrongly be reported as
    # missing) and also understands the platform's own path separators.
    parent = os.path.dirname(arg)
    if parent:
        _is_valid_file(parent)
    return arg


def _check_suffix(path: str) -> str:
    """Checks that file is a .csv or .tsv file, and returns which sep to use

    The extension is matched case-insensitively, so ``table.CSV`` is treated
    the same as ``table.csv``.

    Args:
        path: File path

    Returns: sep
        Sep to use for parsing: a tab for .tsv, a comma for .csv

    Raises:
        ValueError: If the path ends in neither .csv nor .tsv.

    """
    lowered = path.lower()
    if lowered.endswith(".tsv"):
        return "\t"
    if lowered.endswith(".csv"):
        return ","
    raise ValueError("File must be .csv or .tsv")


def read_in_values(path: str) -> DataFrame:
    """Loads a .csv/.tsv table into a DataFrame.

    The separator is chosen from the file extension, the path is checked for
    existence, and the first column of the file becomes the index.

    Args:
        path: File path

    Returns: df
        DataFrame from table in file

    """
    delimiter = _check_suffix(path)
    checked_path = _is_valid_file(path)
    return pd.read_csv(checked_path, sep=delimiter, index_col=0)
def read_in_outliers(path: str, updown: str, iqrs: float) -> OutlierTable:
    """Loads an outliers table from disk and wraps it in an OutlierTable.

    The file is read like any other values table (separator from the
    extension, first column as index). Sample names are recovered from the
    column headers by dropping everything after the last ``col_seps`` in each
    header, then de-duplicated and sorted.

    Args:
        path: File path
        updown: Whether the outliers represent up or down outliers
        iqrs: How many IQRs were used to define an outlier

    Returns: outliers
        OutlierTable object

    """
    delimiter = _check_suffix(path)
    table = pd.read_csv(_is_valid_file(path), sep=delimiter, index_col=0)
    sample_names = {header.rsplit(col_seps, 1)[0] for header in table.columns}
    # NOTE(review): the final positional argument is passed as None here;
    # its meaning is defined by OutlierTable, which is not visible in this file.
    return OutlierTable(table, updown, iqrs, sorted(sample_names), None)
def binarize_annotations(df: DataFrame) -> DataFrame:
    """Refactors an annotation DataFrame so every column has two groups.

    Each column is handled according to its number of distinct non-null
    values:

    * 0 or 1 values: the column is dropped.
    * exactly 2 values: the column is copied as-is.
    * more than 2 values: the column is expanded into one new column per
      value. The new column is named with ``binarized_col_name % (col, val)``
      and holds ``val`` for rows with that value, ``outgroup_val % val`` for
      rows with any other value, and NaN for rows that were null in the
      original column.

    In expanded columns the value is rendered as text with ``_`` replaced by
    ``-`` for both the column name and the cell labels. Rows are matched
    against the original, untransformed value, so values that contain ``_``
    or are not strings are still assigned to their own group.

    Args:
        df: Annotations DataFrame.

    Returns: new_df
        Refactored annotations DataFrame.

    """
    new_df = pd.DataFrame(index=df.index)
    for col in df.columns:
        column = df[col]
        # value_counts orders levels by frequency, which fixes the order of
        # the expanded columns; computed once instead of once per branch.
        levels = column.dropna().value_counts().keys()
        if len(levels) == 2:
            new_df[col] = column
        elif len(levels) > 2:
            missing = column.isnull()
            for level in levels:
                # Sanitised text is for naming only. Comparing the column to
                # it (instead of to `level`) would never match values that
                # contain "_" or are not strings.
                # presumably "_" is avoided because it is used as a
                # separator elsewhere in the package - TODO confirm
                label = str(level).replace("_", "-")
                new_col = binarized_col_name % (col, label)
                new_df.loc[(column != level), new_col] = outgroup_val % label
                new_df.loc[(column == level), new_col] = label
                # Null rows were swept into the outgroup by `!=`; restore them.
                new_df.loc[missing, new_col] = np.nan
    return new_df
def normalize(df: DataFrame) -> DataFrame:
    """Scales each column by a per-column size factor, then takes log2.

    Steps, as implemented:

    1. Divide every row by its arithmetic mean across columns.
    2. Replace infinite ratios with NaN.
    3. Take the median ratio of each column as that column's size factor.
    4. Divide the original values column-wise by the size factors.
    5. Apply a log2 transform.

    Args:
        df: Unnormalized values dataframe

    Returns:
        Normalized dataframe

    """
    row_means = df.mean(axis=1)
    ratios = df.divide(row_means, axis=0)
    finite_ratios = ratios.replace([np.inf, -np.inf], np.nan)
    size_factors = finite_ratios.median()
    scaled = df.divide(size_factors)
    return np.log2(scaled)
def subset_by_genes(
        outliers: DataFrame,
        ind_list: Iterable[str],
        ind_sep: str = None,
) -> DataFrame:
    """Subsets the rows of a table to a given list of index identifiers.

    Args:
        outliers: DataFrame whose rows are to be subset.
        ind_list: Identifiers to keep. Any iterable is accepted, including
            one-shot iterables such as generators.
        ind_sep: Optional separator. When given (and non-empty), each index
            label is cut at the first occurrence of ``ind_sep`` and the row
            is kept if the leading part is in ``ind_list``; index labels must
            be strings in that case. When omitted, ``ind_list`` is used
            directly as row labels.

    Returns:
        The selected rows with all columns. With ``ind_sep`` the original row
        order is kept; without it rows follow the order of ``ind_list``, and
        the lookup goes through ``.loc``, so pandas' handling of missing
        labels applies.

    """
    # Materialise once: a generator would otherwise be exhausted by the first
    # membership test and every later row would be silently dropped.
    wanted = list(ind_list)
    if ind_sep:
        wanted_set = set(wanted)  # O(1) membership per row
        keep = [
            label for label in outliers.index
            if label.split(ind_sep, 1)[0] in wanted_set
        ]
        return outliers.loc[keep, :]
    return outliers.loc[wanted, :]