# Use Python to analyse Australian car accidents, Series 4: confidence intervals

import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.stats import norm
import math as m
import random


# Load the crash records from the comma-separated file.
crash = pd.read_table('crash.csv', engine='python', sep=',')

# Ages recorded as 'Unknown' are mapped to 0 so the column can be converted
# to numbers; selecting ages above 0 then leaves those placeholder rows out.
crash['Age'] = pd.to_numeric(crash['Age'].replace('Unknown', 0))
valid_age = crash.loc[crash['Age'] > 0, 'Age']

# z-based interval for the population mean; the standard deviation of the
# data stands in for the population standard deviation.
def z_confidence_interval(data, confidence=0.95):
    """Return the (lower, upper) z confidence interval for the mean of data.

    data must offer describe() with 'mean', 'std' and 'count' entries, as a
    numeric pandas Series does.  confidence is the two-sided confidence
    level, e.g. 0.95.
    """
    summary = data.describe()
    centre = summary['mean']

    # Two-sided interval: half of the excluded probability sits in each tail.
    tail = (1 - confidence) / 2
    critical = stats.norm.ppf(q=1 - tail)

    margin = critical * summary['std'] / m.sqrt(summary['count'])
    return (centre - margin, centre + margin)
# Show the z interval for the mean of the valid ages.  The parenthesised
# form prints the same value on Python 2 and Python 3.
print(z_confidence_interval(valid_age))

# Sample size needed so that the z interval for the mean has a margin of
# error of at most E (defaults: 95% confidence, margin of 2 years of age).
def z_sample_size(data, E=2, confidence=0.95):
    """Return the sample size that gives a margin of error E for the mean.

    The standard deviation is taken from data.describe()['std'].  The result
    is returned as a float and is not rounded up to a whole number.
    """
    spread = data.describe()['std']
    critical = stats.norm.ppf(q=1 - (1 - confidence) / 2)
    return m.pow(critical * spread / E, 2)
# Show the sample size needed for the default margin of error on age.  The
# parenthesised form prints the same value on Python 2 and Python 3.
print(z_sample_size(valid_age))

# Use the t distribution to get a confidence interval for the population mean.
# Assumes the data are normally distributed and the sample is small (n < 30).
def t_confidence_interval(data, n=29, confidence=0.95):
    """Return a (lower, upper) t confidence interval from a random sample.

    n observations are drawn from data at random, with replacement, using the
    `random` module (so random.seed() makes the result reproducible).  The
    interval is built from the mean and standard deviation of that sample.

    data       -- pandas Series of numeric observations
    n          -- size of the random sample to draw (at least 2)
    confidence -- two-sided confidence level, e.g. 0.95

    Raises ValueError if data is empty or n is smaller than 2.
    """
    size = len(data)
    if size == 0:
        raise ValueError("data must not be empty")
    if n < 2:
        raise ValueError("n must be at least 2")

    # randrange excludes the upper bound, so every position is valid, and
    # iloc selects by position so a non-contiguous index (for example after
    # filtering rows) cannot cause a lookup failure.
    positions = [random.randrange(size) for _ in range(n)]
    data_desc = data.iloc[positions].describe()

    # Two-sided interval: the critical value is the 1 - alpha/2 quantile,
    # with n - 1 degrees of freedom.
    t = stats.t.ppf(1 - (1 - confidence) / 2.0, n - 1)
    E = t * data_desc['std'] / m.sqrt(n)
    return (data_desc['mean'] - E, data_desc['mean'] + E)
# Show the t interval built from a random sample of the valid ages.  The
# parenthesised form prints the same value on Python 2 and Python 3.
print(t_confidence_interval(valid_age))

# Proportion confidence interval; the normal approximation needs n*p > 5
# and n*q > 5.
# NOTE(review): a later definition with the same name in this file replaces
# this one at import time; consider keeping only one of them.
def proportion_confidence_interval(data, confidence=0.90):
    """Return the (lower, upper) interval for the share of the modal value.

    p is the frequency of the most common value in data divided by the
    number of non-missing observations, both taken from data.describe().

    data       -- pandas Series of categorical values (e.g. a gender column)
    confidence -- two-sided confidence level, e.g. 0.90

    Raises ValueError if data has no non-missing observations.
    """
    data_desc = data.describe()

    # Use the non-missing count for both p and the standard error, so
    # missing values do not shrink the margin of error.
    n = data_desc['count']
    if n == 0:
        raise ValueError("data has no non-missing observations")

    p = float(data_desc['freq']) / n
    q = 1 - p
    z = stats.norm.ppf(q=1 - (1 - confidence) / 2)
    E = z * m.sqrt(p * q / n)

    return (p - E, p + E)
# Show the proportion interval for the gender column.  The parenthesised
# form prints the same value on Python 2 and Python 3.
print(proportion_confidence_interval(crash['Gender']))

# Proportion confidence interval; the normal approximation needs n*p > 5
# and n*q > 5.
# NOTE(review): this repeats an earlier definition with the same name in this
# file and replaces it at import time; consider keeping only one of them.
def proportion_confidence_interval(data, confidence=0.90):
    """Return the (lower, upper) interval for the share of the modal value.

    p is the frequency of the most common value in data divided by the
    number of non-missing observations, both taken from data.describe().

    data       -- pandas Series of categorical values (e.g. a gender column)
    confidence -- two-sided confidence level, e.g. 0.90

    Raises ValueError if data has no non-missing observations.
    """
    data_desc = data.describe()

    # Use the non-missing count for both p and the standard error, so
    # missing values do not shrink the margin of error.
    n = data_desc['count']
    if n == 0:
        raise ValueError("data has no non-missing observations")

    p = float(data_desc['freq']) / n
    q = 1 - p
    z = stats.norm.ppf(q=1 - (1 - confidence) / 2)
    E = z * m.sqrt(p * q / n)

    return (p - E, p + E)
# Show the proportion interval for the gender column.  The parenthesised
# form prints the same value on Python 2 and Python 3.
print(proportion_confidence_interval(crash['Gender']))

# Sample size needed to estimate a proportion within a margin of error E.
def proportion_sample_size(data, confidence=0.90, E=0.02):
    """Return the sample size giving margin of error E for a proportion.

    The proportion is the 'freq' entry of data.describe() divided by its
    'count' entry.  The result is returned as a float and is not rounded up
    to a whole number.
    """
    summary = data.describe()
    share = float(summary['freq']) / summary['count']
    critical = stats.norm.ppf(q=1 - (1 - confidence) / 2)
    scaled = m.pow(critical / E, 2)
    return share * (1 - share) * scaled
# Show the sample size needed for the gender proportion.  The parenthesised
# form prints the same value on Python 2 and Python 3.
print(proportion_sample_size(crash['Gender']))

# Chi-square confidence interval for the variance and standard deviation.
def chi_square_confidence_interval(data, confidence=0.90):
    """Return confidence bounds for the population variance and std.

    The result is a list [(var_lower, var_upper), std_lower, std_upper],
    where the std bounds are the square roots of the variance bounds.

    data       -- pandas Series of numeric observations
    confidence -- two-sided confidence level, e.g. 0.90

    Raises ValueError if data has fewer than 2 non-missing observations.
    """
    data_desc = data.describe()
    n = data_desc['count']
    if n < 2:
        raise ValueError("at least 2 non-missing observations are required")

    # The sample variance has n - 1 degrees of freedom.
    df = n - 1
    alpha = 1 - float(confidence)

    # chi2.ppf takes the area to the LEFT of the critical value, so the
    # right-tail critical value (the larger one) is the 1 - alpha/2 quantile
    # and the left-tail critical value (the smaller one) is the alpha/2
    # quantile.
    chi_right = stats.chi2.ppf(q=1 - alpha / 2, df=df)
    chi_left = stats.chi2.ppf(q=alpha / 2, df=df)

    # Dividing by the larger critical value gives the lower bound.
    scaled = df * m.pow(data_desc['std'], 2)
    var_lower = scaled / chi_right
    var_upper = scaled / chi_left

    return [(var_lower, var_upper), m.sqrt(var_lower), m.sqrt(var_upper)]
# Show the variance and std bounds for the valid ages.  The parenthesised
# form prints the same value on Python 2 and Python 3.
print(chi_square_confidence_interval(valid_age))