import numpy as np
import pandas as pd
import scipy.stats as stats
import math
import random
# Hypothesis-testing walkthrough on the crash data set.
#
# Reads 'crash.csv', cleans the 'Age' column, then runs z tests for a mean
# (right-tailed, left-tailed, two-tailed) and a z test for a proportion on
# random samples drawn with replacement. Results are printed to stdout.
# print(...) is called with a single argument so the script behaves the same
# on Python 2 and Python 3.

# 'Unknown' ages are mapped to 0 so the column can be converted to numbers;
# rows with a non-positive age are then excluded as invalid.
crash = pd.read_csv('crash.csv')
crash['Age'] = crash['Age'].replace('Unknown', 0)
crash['Age'] = pd.to_numeric(crash['Age'])
valid_age = crash[crash['Age'] > 0]['Age']

# Summary statistics of all valid ages; std_age is used below as the
# "known" population standard deviation of the z tests.
age_desc = valid_age.describe()
std_age = age_desc['std']
mean_age = age_desc['mean']

# Type I error probability (significance level) is assumed to be 0.05.
alpha = 0.05
# Positive critical values of the standard normal distribution.
z_one_tail = stats.norm.ppf(1 - alpha)      # one-tailed tests
z_two_tail = stats.norm.ppf(1 - alpha / 2)  # two-tailed tests

# z test for a mean: sample size >= 30 and the population standard deviation
# is known, so the sample mean is treated as normally distributed.
# H0: average car accident age is 20, H1: average car accident age > 20.
# This is a right-tailed test: H0 is rejected only when z exceeds the
# critical value.
# random.randrange excludes the upper bound (random.randint would include
# len(valid_age), which is one past the last valid position).
sample = [random.randrange(len(valid_age)) for _ in range(30)]
# valid_age keeps the row labels of `crash` (with gaps where rows were
# filtered out), so the sampled integers must be used as positions (.iloc),
# not as labels.
sample_desc = valid_age.iloc[sample].describe()
z = (sample_desc['mean'] - 20) / (std_age / math.sqrt(30))
if z > z_one_tail:
    print('H1 is true, average car accident age is greater than 20.')
else:
    print('H0 is true, we can claim average car accident age is 20.')

# Another claim, H1: average car accident age < 25, H0: average car accident
# age is 25. This is a left-tailed test: H0 is rejected only when z falls
# below the negative critical value.
z = (sample_desc['mean'] - 25) / (std_age / math.sqrt(30))
if z < -z_one_tail:
    print('H1 is true, average car accident age is less than 25.')

# Another claim, H1: average car accident age is not 25. This is a
# two-tailed test, so alpha is split between both tails.
if abs(z) > z_two_tail:
    print('H1 is true, average car accident age is not 25.')
else:
    print('H0 is true, average car accident age is 25.')

# A t hypothesis test is similar to the z test; the difference is that in a
# t test the population standard deviation is unknown and the degrees of
# freedom must be taken into account.

# Another claim: 70% of car accident drivers are male. The z test for a
# proportion works like the z test for a mean.
# H0: p = 0.7, H1: p != 0.7 (two-tailed).
valid_gender = crash['Gender']
sample = [random.randrange(len(valid_gender)) for _ in range(300)]
sample_desc = valid_gender.iloc[sample].describe()
# NOTE(review): describe()['freq'] is the count of the most frequent value in
# the sample; it equals the male count only if males are the majority --
# confirm against the data, or count the male label explicitly.
z = (float(sample_desc['freq']) / sample_desc['count'] - 0.7) / math.sqrt(0.7 * 0.3 / sample_desc['count'])
if abs(z) < z_two_tail:
    print('assume 70% car accident agender is male is correct.')
else:
    print('assume 70% car accident agender is male is incorrect.')
# chi-square test for variance claim (no example)