Source code for blacksheep.simulate

'''
File: simulate.py
Emily Kawaler, last updated 10/01/19

# Do we want to change the rounding to 2 decimal places? It's nicer but provides less information.
# Automatically replace existing pvalue table file?
# Better error detection, especially with the input arguments
# Future, if I'm having a slow week sometime: threading simulations
# Make number of points sampled from the KDE an input parameter (?)
# Make KDE vs true distribution an input parameter (?) 
'''

import random
import sys
import argparse
import numpy as np
from scipy.stats import ttest_1samp
from scipy.stats import percentileofscore
from scipy.stats import gaussian_kde

# Argparser, when testing on its own
def _make_parser():
    parser = argparse.ArgumentParser(prog="blacksheep", description="")
    parser.add_argument("--version", "-v", action="version", version="%(prog)s 0.0.1")

    parser.add_argument(
        "values",
        type=str,
        help="File path to input values. Samples are columns and genes/sites are rows. Only .tsv "
             "and .csv accepted.",
    )
    parser.add_argument(
        "--ind_sep",
        type=str,
        default="-",
        help="Delimiter between the parent molecule (e.g. a gene name such "
             "as ATM) and a site identifier (e.g. S365). Default is -",
    )
    parser.add_argument(
        "--iqrs",
        type=float,
        default=1.5,
        help="Number of inter-quartile ranges (IQRs) above or below the "
             "median to consider a value an outlier. Default is 1.5.",
    )
    parser.add_argument(
        "--reps",
        type=int,
        default=1000000,
        help="Number of repetitions for the simulation to perform. "
             "Default is 1,000,000.",
    )
    parser.add_argument(
        "--output_prefix",
        type=str,
        default="simulated_pvals",
        help="Output prefix for writing files. Default is 'simulated_pvals'.",
    )
    parser.add_argument(
        "--molecules",
        nargs='+',
        default=[],
        help="List of parent molecules of interest. Empty list or absence of "
             "argument defaults to all parent molecules in input file.",
    )
    parser.add_argument(
        "--pval",
        type=float,
        default=0.05,
        help="p-value threshold for significant results. Must be between 0 and 1."
        "Default is 0.05.",
    )

    return parser

def get_full_gene_list(ind_sep, infile):
	# Gets a list of all molecules in the file
	f = open(infile, 'r')
	genes = set([])
	line = f.readline() # Header line
	line = f.readline()
	while line:
		genes.add(line.split()[0].split(ind_sep)[0])
		line = f.readline()
	return genes


def get_values(gene, ind_sep, infile):
	# Digs around in the input file for the specified gene
	f = open(infile,'r')
	values = {} # only non-missing values
	missings = {} # missing values
	all_values = {} # all values, including missing and present
	line = f.readline()
	total = len(line.split())-1
	while line:
		if line.split(ind_sep)[0] == gene and len(line.split()) > 1:
			temp_vals = [float(num) for num in line.split()[1:]]
			if len(temp_vals) > 1:
				values[line.split()[0]] = [float(num) for num in line.split()[1:]]
				missings[line.split()[0]] = 1-(len(values[line.split()[0]])/float(total))
				all_values[line.split()[0]] = line.rstrip('\n').split('\t')[1:]
		line = f.readline()
		
	return values, missings, all_values


def outlier_thresholds(values, thresh):
	# Fixes the value that is (threshold) IQR above the median for each phospho
	o_thresh = {}
	for val in values.keys():
		dist = values[val]
		iqr = np.percentile(dist,75)-np.percentile(dist,25)
		outlier_threshold = np.percentile(dist,50)+(iqr*thresh)
		o_thresh[val] = outlier_threshold
	return o_thresh


def simulate(values, missings, o_thresh, reps):
	# Generates a random value for each p-site and counts outliers
	# Does this (reps) number of times
	r = random.Random()
	ol_list = []
	for i in range(reps):
		outliers = 0
		for val in values.keys():
			m = r.uniform(0.0,1.0)
			if m < missings[val]:
				# Makes sure to allow for missing psites.
				continue
			u = r.choice(values[val])
			if u > o_thresh[val]:
				outliers +=1
		ol_list.append(outliers)
	return ol_list


def simulate_kde(values, missings, o_thresh, reps):
	# Generates a KDE for each p-site from the existing values,
	# generates a random value for each p-site from that KDE, and counts outliers
	# Does this (reps) number of times
	r = random.Random()
	ol_list = []
	stored_kdes = {}
	for val in values.keys():
		# Make the size an editable parameter?
		stored_kdes[val] = gaussian_kde(values[val]).resample(size=reps)[0]
	for i in range(reps):
		outliers = 0
		for val in values.keys():
			m = r.uniform(0.0,1.0)
			if m < missings[val]:
				# Makes sure to allow for missing psites.
				continue
			# I used to pick randomly from the stored KDEs but realized that would
			# skew the distribution toward the mode, probably?
			u = stored_kdes[val][i]
			if u > o_thresh[val]:
				outliers +=1
		ol_list.append(outliers)
	return ol_list


def alpha_thresh(ol_dist, pval):
	# Spits out outlier number of outliers
	pv = 100-(100*pval)
	return np.percentile(ol_dist,pv)
	#return np.mean(ol_dist)+1.64*np.std(ol_dist) # I think this is the 0.05, if you have a normal distribution, which you probably won't. 

def generate_output_line(o_thresh, values, ol_dist, pval):
	# Determines the p-value for each sample being significantly hyperphosphorylated
	# for the given gene.
		
	output_list = []

	for s in range(len(list(values.values())[0])): # for each sample
		tot_outliers = 0
		for o in o_thresh.keys():
			if values[o][s]:
				if float(values[o][s]) > o_thresh[o]:
					tot_outliers += 1
		pv = round(((100-percentileofscore(ol_dist,tot_outliers-1,kind='weak'))/100.0),3) # pval for i+1 outliers
		if pv <= pval:
			output_list.append(str(pv))
		else:
			output_list.append('NS')
		
	return output_list
	
[docs]def run_simulations(infile, ind_sep, thresh, reps, outfile, genes, pval):
	w = open(outfile+"_pvals.tsv", 'w')
	
	# writing header to file
	f = open(infile, 'r')
	w.write(f.readline())
	f.close()
	
	if len(genes) == 0:
		genes = get_full_gene_list(ind_sep, infile)
	
	for gene in genes: # Can make this much more efficient
		# Get values for your gene from the input file
		values, missings, all_values = get_values(gene, ind_sep, infile)
		# Figure out the outlier threshold for each phosphosite
		o_thresh = outlier_thresholds(values, thresh)
		# Do the actual simulation
		ol_dist = simulate_kde(values, missings, o_thresh, reps)
		# Grab out the significance threshold for outliers
		out_line = alpha_thresh(ol_dist, pval)
		
		# Write to the output file
		if len(all_values) > 0: # won't write it out if the gene has no phosphosites with more than one value
			w.write(gene+'\t')
			to_write = generate_output_line(o_thresh, all_values, ol_dist, pval)
			w.write('\t'.join(to_write)+'\n')
		
	w.close()

if __name__=='__main__':

	args = sys.argv[1:]
	args = _make_parser().parse_args(args) 
    
	run_simulations(
            args.values,
            args.ind_sep,
            args.iqrs,
            int(args.reps),
            args.output_prefix,
            args.molecules,
            args.pval,
        )