# Source code for blacksheep.parsers

import os.path
import pandas as pd
import numpy as np
from pandas import DataFrame
from typing import Iterable
from blacksheep.classes import OutlierTable
from blacksheep._constants import *


def _is_valid_file(arg: str) -> str:
    """Checks if file exists (probably pretty redundant except as a type checker in argparse)

    Args:
        arg: File path

    Returns: arg
        Validated file path

    """
    if not os.path.exists(arg):
        raise FileNotFoundError("%s does not exist" % arg)
    return arg


def _check_output_prefix(arg: str) -> str:
    """Checks if output prefix is valid

    Args:
        arg: Output prefix

    Returns: arg
        Validation output prefix

    """

    if "/" in arg:
        prefix = arg.rsplit("/", 1)[0]
        _is_valid_file(prefix)
    return arg


def _check_suffix(path: str) -> str:
    """Checks that file is a .csv or .tsv file, and returns which sep to use

    Args:
        path: File path

    Returns: sep
        Sep to use for parsing

    """

    if path[-4:] == ".tsv":
        return "\t"
    if path[-4:] == ".csv":
        return ","
    raise ValueError("File must be .csv or .tsv")


def read_in_values(path: str) -> DataFrame:
    """Reads a .csv or .tsv table into a DataFrame.

    The separator is chosen from the file extension, and the first column of
    the file becomes the index of the returned frame.

    Args:
        path: File path

    Returns: df
        DataFrame from table in file

    Raises:
        ValueError: If the extension is not .csv or .tsv (raised by
            ``_check_suffix`` before the file is touched).
        FileNotFoundError: If ``path`` does not exist (raised by
            ``_is_valid_file``).

    """
    # The extension is validated first, so an unsupported suffix is reported
    # even when the file itself is missing.
    sep = _check_suffix(path)
    checked_path = _is_valid_file(path)
    return pd.read_csv(checked_path, sep=sep, index_col=0)


def read_in_outliers(path: str, updown: str, iqrs: float) -> OutlierTable:
    """Parses a file into an OutlierTable object.

    The table is read with its first column as the index. Sample names are
    recovered from the column labels by dropping everything after the last
    occurrence of ``col_seps`` in each label, then de-duplicating and sorting.

    Args:
        path: File path
        updown: Whether the outliers represent up or down outliers
        iqrs: How many IQRs were used to define an outlier

    Returns: outliers
        OutlierTable object

    Raises:
        ValueError: If the extension is not .csv or .tsv.
        FileNotFoundError: If ``path`` does not exist.

    """
    sep = _check_suffix(path)
    df = pd.read_csv(_is_valid_file(path), sep=sep, index_col=0)
    # Several columns can share one sample prefix, so collect the prefixes in
    # a set before sorting. col_seps is expected to come from the
    # blacksheep._constants star import.
    samples = sorted({col.rsplit(col_seps, 1)[0] for col in df.columns})
    # NOTE(review): the meaning of the trailing None argument is defined by
    # OutlierTable, which is not visible here; it is passed through unchanged.
    return OutlierTable(df, updown, iqrs, samples, None)


def binarize_annotations(df: DataFrame) -> DataFrame:
    """Takes an annotation DataFrame, checks each column for the number of possible values,
    and adjusts based on that. If the column has 0 or 1 options, it is dropped. Cols with 2
    possible values are retained as-is. Cols with more than 2 values are expanded. For each
    value in that column, a new column is created with val and not_val options.

    Missing entries in an expanded column stay missing in every column derived
    from it. Underscores in a value are replaced with hyphens when building
    the new column name and labels.

    Args:
        df: Annotations DataFrame.

    Returns: new_df
        Refactored annotations DataFrame.
    """

    new_df = pd.DataFrame(index=df.index)
    for col in df.columns:
        column = df[col]
        # value_counts() sorts by frequency, which also fixes the order in
        # which the expanded columns are created.
        values = column.dropna().value_counts().keys()
        if len(values) == 2:
            new_df[col] = column
        elif len(values) > 2:
            missing = column.isnull()
            for raw_val in values:
                # Compare against the raw value; the sanitised string is only
                # a display label. Comparing against the label made every
                # non-string value, and every value containing "_", match
                # nothing and fall entirely into the outgroup.
                label = str(raw_val).replace("_", "-")
                new_col = binarized_col_name % (col, label)
                new_df.loc[(column != raw_val), new_col] = outgroup_val % label
                new_df.loc[(column == raw_val), new_col] = label
                # NaN != raw_val is True, so missing rows were just labelled
                # as outgroup above; reset them to missing.
                new_df.loc[missing, new_col] = np.nan
    return new_df


def normalize(df: DataFrame) -> DataFrame:
    """Performs median of ratios normalization on a given dataframe, then a log2 transform.

    Each value is divided by the arithmetic mean of its row, the median of
    those ratios is taken per column as that column's scaling factor, every
    column is divided by its factor, and the result is log2-transformed.

    Args:
        df: Unnormalized values dataframe

    Returns: Normalized dataframe

    """
    # Ratio of every value to the mean of its row.
    ratios = df.divide(df.mean(axis=1), axis=0)
    # Division by a zero row mean produces +/-inf; turn those into NaN so the
    # median below skips them instead of being dragged to infinity.
    ratios = ratios.replace([np.inf, -np.inf], np.nan)
    # One scaling factor per column; divide() aligns it on column labels.
    size_factors = ratios.median()
    return np.log2(df.divide(size_factors))


def subset_by_genes(
        outliers: DataFrame, ind_list: Iterable[str], ind_sep: str = None,
        ) -> DataFrame:
    """Selects the rows of a table that belong to a given set of identifiers.

    Args:
        outliers: Table whose index labels are matched against ``ind_list``.
        ind_list: Identifiers to keep. Any iterable is accepted, including
            one-shot iterators.
        ind_sep: Optional separator. When given (and non-empty), each index
            label is split once on it and only the part before the separator
            is compared with ``ind_list``. When omitted, ``ind_list`` is used
            directly as the row labels to select.

    Returns: subset
        Rows of ``outliers`` matching the requested identifiers.

    Raises:
        KeyError: When ``ind_sep`` is not given and a label in ``ind_list``
            is absent from the index (raised by pandas).

    """
    # Materialise once: ind_list may be a generator, which repeated `in`
    # tests would exhaust after the first match.
    wanted = list(ind_list)
    if ind_sep:
        wanted_set = set(wanted)
        # A boolean mask keeps each matching row exactly once, in its original
        # order, even when index labels repeat.
        keep = [label.split(ind_sep, 1)[0] in wanted_set for label in outliers.index]
        return outliers.loc[keep, :]
    return outliers.loc[wanted, :]