import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import norm
import math as m
import random
# Load the crash records from CSV.
crash = pd.read_table('crash.csv', sep=',', engine='python')
# 'Unknown' ages are mapped to 0 so the column converts to numbers;
# the `> 0` filter below then leaves those rows out again.
crash['Age'] = pd.to_numeric(crash['Age'].replace('Unknown', 0))
valid_age = crash.loc[crash['Age'] > 0, 'Age']
# Assume the population standard deviation equals the sample standard deviation.
# Confidence interval for the population mean.
def z_confidence_interval(data, confidence=0.95):
    """Return a z-based confidence interval for the mean of *data*.

    The 'std' value reported by ``data.describe()`` stands in for the
    population standard deviation.

    data: object whose ``describe()`` result exposes 'mean', 'std' and
        'count' (e.g. a numeric pandas Series).
    confidence: two-sided confidence level (0.95 by default).

    Returns a ``(lower, upper)`` tuple.
    """
    summary = data.describe()
    center = summary['mean']
    # Two-sided critical value: the leftover probability is split per tail.
    critical = stats.norm.ppf(q=1-(1-confidence)/2)
    margin = critical*summary['std']/m.sqrt(summary['count'])
    return (center-margin, center+margin)
# Report the z interval for the valid ages (call form of print works on Python 2 and 3).
print(z_confidence_interval(valid_age))
# Sample size needed for 95% confidence with a maximum margin of error of 2 for age.
def z_sample_size(data, E=2, confidence=0.95):
    """Return the sample size needed to estimate a mean within margin *E*.

    Uses the 'std' value from ``data.describe()`` as the standard deviation.

    data: object whose ``describe()`` result exposes 'std'
        (e.g. a numeric pandas Series).
    E: maximum margin of error (2 by default).
    confidence: two-sided confidence level (0.95 by default).

    Returns the raw (unrounded) value ``(z * std / E) ** 2`` as a float.
    """
    spread = data.describe()['std']
    critical = stats.norm.ppf(q=1-(1-confidence)/2)
    ratio = critical*spread/E
    return m.pow(ratio, 2)
# Report the required sample size (call form of print works on Python 2 and 3).
print(z_sample_size(valid_age))
# Use the t distribution to get a confidence interval for the population mean.
# Assumes the data are normally distributed and the sample is small (n < 30).
def t_confidence_interval(data, n=29, confidence=0.95):
    """Return a t-based confidence interval for the mean from a random sample.

    Draws *n* observations from *data* (with replacement, using the ``random``
    module) and builds a two-sided interval around the sample mean with
    ``n - 1`` degrees of freedom.

    data: pandas Series of numeric values.
    n: number of observations to draw (29 by default).
    confidence: two-sided confidence level (0.95 by default).

    Returns a ``(lower, upper)`` tuple.

    Raises ValueError if *data* is empty or *n* is smaller than 2.
    """
    size = len(data)
    if size == 0:
        raise ValueError("data must not be empty")
    if n < 2:
        raise ValueError("n must be at least 2 to estimate a standard deviation")
    # randrange excludes its upper bound, so every position is valid
    # (randint(0, size) could also return `size`, one past the end).
    positions = [random.randrange(size) for _ in range(n)]
    # Select by position: the Series index labels need not be 0..size-1
    # (e.g. after filtering rows), so label-based lookup is not safe here.
    sample_desc = data.iloc[positions].describe()
    # Two-sided critical value, matching the z-based interval functions.
    t = stats.t.ppf(1 - (1 - confidence) / 2.0, n - 1)
    E = t * sample_desc['std'] / m.sqrt(n)
    return (sample_desc['mean'] - E, sample_desc['mean'] + E)
# Report the t interval for the valid ages (call form of print works on Python 2 and 3).
print(t_confidence_interval(valid_age))
# Confidence interval for a proportion; requires n*p > 5 and n*q > 5.
def proportion_confidence_interval(data, confidence=0.90):
    """Return a z-based confidence interval for the share of the modal value.

    The proportion is the frequency of the most common value ('freq' from
    ``data.describe()``) divided by the number of non-missing observations
    ('count').

    data: categorical pandas Series (its ``describe()`` must expose 'freq'
        and 'count').
    confidence: two-sided confidence level (0.90 by default).

    Returns a ``(lower, upper)`` tuple.

    Raises ValueError if *data* has no non-missing observations.
    """
    data_desc = data.describe()
    # Use the non-missing count for both the proportion and the standard
    # error; len(data) would also count missing values.
    n = data_desc['count']
    if n == 0:
        raise ValueError("data contains no non-missing observations")
    z = stats.norm.ppf(q=1 - (1 - confidence) / 2.0)
    p = float(data_desc['freq']) / n
    q = 1 - p
    E = z * m.sqrt(p * q / n)
    return (p - E, p + E)
# Report the proportion interval for Gender (call form of print works on Python 2 and 3).
print(proportion_confidence_interval(crash['Gender']))
# Confidence interval for a proportion; requires n*p > 5 and n*q > 5.
def proportion_confidence_interval(data, confidence=0.90):
    """Return a z-based confidence interval for the share of the modal value.

    NOTE(review): this redefines a function of the same name that appears
    earlier in this file; one of the two definitions is redundant.

    The proportion is the frequency of the most common value ('freq' from
    ``data.describe()``) divided by the number of non-missing observations
    ('count').

    data: categorical pandas Series (its ``describe()`` must expose 'freq'
        and 'count').
    confidence: two-sided confidence level (0.90 by default).

    Returns a ``(lower, upper)`` tuple.

    Raises ValueError if *data* has no non-missing observations.
    """
    data_desc = data.describe()
    # Use the non-missing count for both the proportion and the standard
    # error; len(data) would also count missing values.
    n = data_desc['count']
    if n == 0:
        raise ValueError("data contains no non-missing observations")
    z = stats.norm.ppf(q=1 - (1 - confidence) / 2.0)
    p = float(data_desc['freq']) / n
    q = 1 - p
    E = z * m.sqrt(p * q / n)
    return (p - E, p + E)
# Report the proportion interval for Gender (call form of print works on Python 2 and 3).
print(proportion_confidence_interval(crash['Gender']))
# Sample size needed to estimate a proportion.
def proportion_sample_size(data, confidence=0.90, E=0.02):
    """Return the sample size needed to estimate a proportion within *E*.

    The proportion is the frequency of the most common value ('freq' from
    ``data.describe()``) divided by 'count'.

    data: categorical pandas Series (its ``describe()`` must expose 'freq'
        and 'count').
    confidence: two-sided confidence level (0.90 by default).
    E: maximum margin of error (0.02 by default).

    Returns the raw (unrounded) value ``p * (1 - p) * (z / E) ** 2``.
    """
    summary = data.describe()
    share = float(summary['freq'])/summary['count']
    critical = stats.norm.ppf(q=1-(1-confidence)/2)
    return share*(1-share)*m.pow(critical/E, 2)
# Report the proportion sample size for Gender (call form of print works on Python 2 and 3).
print(proportion_sample_size(crash['Gender']))
# Chi-square confidence intervals for the variance and standard deviation.
def chi_square_confidence_interval(data, confidence=0.90):
    """Return chi-square confidence bounds for the variance and std of *data*.

    data: numeric pandas Series (its ``describe()`` must expose 'std' and
        'count').
    confidence: two-sided confidence level (0.90 by default).

    Returns a list ``[(var_lower, var_upper), std_lower, std_upper]``.

    Raises ValueError if fewer than 2 non-missing observations are available.
    """
    data_desc = data.describe()
    # 'count' ignores missing values, just like 'std' does.
    n = int(data_desc['count'])
    if n < 2:
        raise ValueError("at least 2 observations are required")
    # The sample variance has n - 1 degrees of freedom.
    df = n - 1
    alpha = 1 - confidence
    chi_lower = stats.chi2.ppf(q=alpha / 2.0, df=df)
    chi_upper = stats.chi2.ppf(q=1 - alpha / 2.0, df=df)
    scaled = df * m.pow(data_desc['std'], 2)
    # Dividing by the larger critical value gives the lower bound.
    var_lower = scaled / chi_upper
    var_upper = scaled / chi_lower
    return [(var_lower, var_upper), m.sqrt(var_lower), m.sqrt(var_upper)]
# Report the variance/std bounds for the valid ages (call form of print works on Python 2 and 3).
print(chi_square_confidence_interval(valid_age))