# Compiled by Randal S. Olson
# Description: Useful functions for statistics and data parsing that I've written
#              or run across during my research


# Handy function for parsing lists from a csv reader into a string-referenced dictionary
# Input: parsed list from csv.reader()
# Returns: dictionary of data lists, list of header strings in same order as csv file
# NOTE: assumes every data value below the header row can be converted to an int
def parse_csv_data(data_list):
    data_dict = {}
    headers = []
    first_row = True

    for row in data_list:
        # first row has headers
        if first_row:
            first_row = False
            for header in row:
                colname = str(header)
                headers.append(colname)
                data_dict[colname] = []
        else:
            colnum = 0
            for datapoint in row:
                data_dict[headers[colnum]].append(int(datapoint))
                colnum += 1

    return data_dict, headers


# Confidence interval bootstrapping function
# Written by: cevans
# URL: https://bitbucket.org/cevans/bootstrap/
from numpy.random import randint
from scipy.stats import norm
from numpy import *


def ci(data, statfun, alpha=0.05, n_samples=10000, method='bca'):
    # Ensure that our data is, in fact, an array.
    data = array(data)

    # First create an array of bootstrap sample indexes:
    indexes = randint(data.shape[0], size=(n_samples, data.shape[0]))

    # Then apply this to get the bootstrap samples and statistics over them.
    samples = data[indexes]
    stat = array([statfun(x) for x in samples])

    # Normal-theory interval --- doesn't use sorted statistics.
    # CI = observed statistic +/- z_(1-alpha/2) * bootstrap standard error
    if method == 'nti':
        bstd = std(stat)
        ostat = statfun(data)
        return (ostat - bstd * norm.ppf(1 - alpha / 2),
                ostat + bstd * norm.ppf(1 - alpha / 2))

    stat_sorted = sort(stat)

    # Percentile interval
    if method == 'pi':
        return (stat_sorted[int(round(n_samples * alpha / 2))],
                stat_sorted[int(round(n_samples * (1 - alpha / 2)))])

    # Bias-Corrected Accelerated (BCa) interval
    elif method == 'bca':
        ostat = statfun(data)

        # Bias-correction factor: based on the proportion of bootstrap
        # statistics that fall below the observed statistic.
        z = norm.ppf((1.0 * sum(stat < ostat) + 0.5 * sum(stat == ostat)) / (n_samples + 1))

        # Calculate the jackknife (leave-one-out) distribution and
        # corresponding statistics quickly.
        j_indexes = (lambda n: delete(tile(array(range(0, n)), n),
                                      range(0, n * n, n + 1)).reshape((n, n - 1)))(len(data))
        jstat = array([statfun(x) for x in data[j_indexes]])
        jmean = mean(jstat)

        # Acceleration factor from the jackknife statistics.
        a = sum((jstat - jmean)**3) / (6.0 * sum((jstat - jmean)**2)**1.5)

        zp = z + norm.ppf(1 - alpha / 2)
        zm = z - norm.ppf(1 - alpha / 2)

        a1 = norm.cdf(z + zm / (1 - a * zm))
        a2 = norm.cdf(z + zp / (1 - a * zp))

        return (stat_sorted[int(round(n_samples * a1))],
                stat_sorted[int(round(n_samples * a2))])

    else:
        raise ValueError("Method %s not supported" % method)


# Mangles the CIs from the ci() function into a format that matplotlib's
# errorbar() function likes
# Input:
#   data = data to get bootstrapped CIs for
#   statfun = function to compute CIs over (usually, mean)
#   alpha = size of CIs (0.05 --> 95% CIs)
#   n_samples = # of bootstrap populations to construct
# Returns: CI values formatted for matplotlib errorbar() function
def ci_errorbar(data, statfun, alpha=0.05, n_samples=10000, method='bca'):
    data = list(data)
    stat = statfun(data)
    cie = list(ci(data, statfun, alpha, n_samples, method))

    # errorbar() expects offsets from the central value, not absolute bounds.
    low = stat - float(cie[0])
    high = float(cie[1]) - stat

    return [[low], [high]]
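

# ---------------------------------------------------------------------------
# Usage sketch (added for illustration; not part of the original compilation).
# Shows parse_csv_data() on an in-memory CSV and ci()/ci_errorbar() on a
# synthetic sample. The CSV text, column names, and distribution parameters
# below are made-up placeholders, not values from the original code.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import csv
    from io import StringIO
    from numpy import mean
    from numpy.random import normal

    # parse_csv_data() expects the row iterator produced by csv.reader() and
    # integer-valued data below the header row.
    csv_text = "trial,score\n1,10\n2,12\n3,9\n4,11\n"
    data_dict, headers = parse_csv_data(csv.reader(StringIO(csv_text)))
    print(headers)                 # ['trial', 'score']
    print(data_dict['score'])      # [10, 12, 9, 11]

    # 95% BCa bootstrap confidence interval around the mean of a synthetic sample.
    sample = normal(loc=5.0, scale=2.0, size=100)
    lower, upper = ci(sample, mean)
    print("mean = %.3f, 95%% CI = (%.3f, %.3f)" % (mean(sample), lower, upper))

    # The same interval reshaped into the [[lower offset], [upper offset]] form
    # that matplotlib's errorbar(yerr=...) expects for a single point.
    print(ci_errorbar(sample, mean))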