import numpy as np
import pandas as pd
from scipy import stats
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib
# IPython/Jupyter magic (not plain Python): draw figures inline in the notebook output
%matplotlib inline
# apply the 'ggplot' style sheet to every figure created below
matplotlib.style.use('ggplot')
# Load the cleaned data; fname can point at a single month or at a combined file.
# fname is also sliced later on to build the names of the saved figures.
fname = 'FB_data_ML_with_uid_201508.csv'
DF00 = pd.read_csv('./' + fname)
# Drop the unnamed first column (cosmetic only -- presumably a row index saved
# with the CSV; confirm). It is removed only when present so a file written
# without it still loads, and `axis` is passed by keyword because recent pandas
# releases no longer accept it positionally.
if 'Unnamed: 0' in DF00.columns:
    DF00.drop('Unnamed: 0', axis=1, inplace=True)
print(DF00.shape)
# Ranking table: one row per habit with the number of times it was skipped
# (rows with Action == 1), ordered as value_counts() returns them.
skipped_habits = DF00.loc[DF00['Action'] == 1, 'Habit']
hbt_skp_count = pd.DataFrame(skipped_habits.value_counts()).reset_index()
hbt_skp_count.columns = ['Habit', 'num_skp']
hbt_skp_count.head(10)  # change the number to view more/less
# Enter here the habit you want to look at: n is its row number in
# hbt_skp_count (the index printed before its name; 0 means the first one).
n = 0
hbt_name = hbt_skp_count.values[n, 0]
print('looking at habit: "' + hbt_name + '"')
# every record of the chosen habit
hbt_recd = DF00[DF00['Habit'] == hbt_name]
# completed habit record (Action == 0), in format of day of week
hbt_recd_cmp = hbt_recd[hbt_recd['Action'] == 0].Day_of_Week
# skipped habit record (Action == 1), in format of day of week
hbt_recd_skp = hbt_recd[hbt_recd['Action'] == 1].Day_of_Week
# plot complete pattern
# Count completed events per weekday. Reindexing over all seven day codes
#   * yields the rows in ascending day order without DataFrame.sort(), which
#     newer pandas releases no longer provide, and
#   * keeps a zero-height bar for any day without events, so the fixed
#     Mon..Sun tick labels set below always sit under the right bar.
# NOTE(review): assumes Day_of_Week is coded 0 (Mon) .. 6 (Sun) -- confirm.
cmp_day_counts = hbt_recd_cmp.value_counts().reindex(range(7), fill_value=0)
CMP_plt = pd.DataFrame(cmp_day_counts).reset_index()
CMP_plt.columns = ['Day_of_Week', 'num']
print('Completed habit record:')
print(CMP_plt)
CMP_plt.plot(x='Day_of_Week', y='num', kind='bar', width=0.8, figsize=(12.0, 8.0),
             fontsize=22, legend=False, xlim=[-0.5, 6.5])
plt.ylabel('Number of events')
plt.xlabel('')
plt.title('Habit: "' + hbt_name + '" -- complete pattern')
# bar positions 0..6 are relabelled with weekday names
locs, labels = plt.xticks(np.arange(0, 7), ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'))
plt.setp(labels, rotation=45)
# fname[20:-4] drops the first 20 characters and the last 4 (the extension)
# of the data file name; the remainder tags the figure file.
plt.savefig('./Figures/' + fname[20:-4] + '_' + hbt_name + '_cmp.png')
# plot skip pattern
# Count skipped events per weekday. Reindexing over all seven day codes
#   * yields the rows in ascending day order without DataFrame.sort(), which
#     newer pandas releases no longer provide, and
#   * keeps a zero-height bar for any day without skips, so the fixed
#     Mon..Sun tick labels set below always sit under the right bar.
# NOTE(review): assumes Day_of_Week is coded 0 (Mon) .. 6 (Sun) -- confirm.
skp_day_counts = hbt_recd_skp.value_counts().reindex(range(7), fill_value=0)
SKP_plt = pd.DataFrame(skp_day_counts).reset_index()
SKP_plt.columns = ['Day_of_Week', 'num']
print('Skipped habit record:')
print(SKP_plt)
SKP_plt.plot(x='Day_of_Week', y='num', kind='bar', width=0.8, figsize=(12.0, 8.0),
             fontsize=22, legend=False, xlim=[-0.5, 6.5])
plt.ylabel('Number of skipped times')
plt.xlabel('')
plt.title('Habit: "' + hbt_name + '" -- skip pattern')
# bar positions 0..6 are relabelled with weekday names
locs, labels = plt.xticks(np.arange(0, 7), ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'))
plt.setp(labels, rotation=45)
# fname[20:-4] drops the first 20 characters and the last 4 (the extension)
# of the data file name; the remainder tags the figure file.
plt.savefig('./Figures/' + fname[20:-4] + '_' + hbt_name + '_skp.png')
# for complete pattern
# Chi-square goodness-of-fit test of the per-weekday counts; with no expected
# frequencies given, stats.chisquare tests against equal counts in every category.
# Weekdays without any event must enter the test as explicit zeros:
# value_counts() alone omits them, which would drop categories from the test
# and distort the statistic and p-value.
# NOTE(review): assumes Day_of_Week is coded 0 (Mon) .. 6 (Sun) -- confirm.
ct = hbt_recd_cmp.value_counts().reindex(range(7), fill_value=0).values
chi2, p = stats.chisquare(ct)
print(p)
if p < 0.05:
    print('the completed habit pattern is significant (p = %.3f)' % (p))
else:
    print('the completed habit pattern is NOT significant (p = %.3f)' % (p))
# for skip pattern
# Chi-square goodness-of-fit test of the per-weekday skip counts; with no expected
# frequencies given, stats.chisquare tests against equal counts in every category.
# Weekdays without any skip must enter the test as explicit zeros:
# value_counts() alone omits them, which would drop categories from the test
# and distort the statistic and p-value.
# NOTE(review): assumes Day_of_Week is coded 0 (Mon) .. 6 (Sun) -- confirm.
ct = hbt_recd_skp.value_counts().reindex(range(7), fill_value=0).values
chi2, p = stats.chisquare(ct)
print(p)
if p < 0.05:
    print('the skipped habit pattern is significant (p = %.3f)' % (p))
else:
    print('the skipped habit pattern is NOT significant (p = %.3f)' % (p))
# Choose the habit to inspect: n is its row number in hbt_skp_count
# (the index printed before its name; 0 means the first one).
n = 2
hbt_name = hbt_skp_count.iloc[n, 0]
print('looking at habit: "' + hbt_name + '"')
# every record of the chosen habit
hbt_recd = DF00.loc[DF00['Habit'] == hbt_name]
# weekday of each completed record (Action == 0)
hbt_recd_cmp = hbt_recd.loc[hbt_recd['Action'] == 0, 'Day_of_Week']
# weekday of each skipped record (Action == 1)
hbt_recd_skp = hbt_recd.loc[hbt_recd['Action'] == 1, 'Day_of_Week']
# plot complete pattern
# Count completed events per weekday. Reindexing over all seven day codes
#   * yields the rows in ascending day order without DataFrame.sort(), which
#     newer pandas releases no longer provide, and
#   * keeps a zero-height bar for any day without events, so the fixed
#     Mon..Sun tick labels set below always sit under the right bar.
# NOTE(review): assumes Day_of_Week is coded 0 (Mon) .. 6 (Sun) -- confirm.
cmp_day_counts = hbt_recd_cmp.value_counts().reindex(range(7), fill_value=0)
CMP_plt = pd.DataFrame(cmp_day_counts).reset_index()
CMP_plt.columns = ['Day_of_Week', 'num']
print('Completed habit record:')
print(CMP_plt)
CMP_plt.plot(x='Day_of_Week', y='num', kind='bar', width=0.8, figsize=(12.0, 8.0),
             fontsize=22, legend=False, xlim=[-0.5, 6.5])
plt.ylabel('Number of events')
plt.xlabel('')
plt.title('Habit: "' + hbt_name + '" -- complete pattern')
# bar positions 0..6 are relabelled with weekday names
locs, labels = plt.xticks(np.arange(0, 7), ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'))
plt.setp(labels, rotation=45)
# fname[20:-4] drops the first 20 characters and the last 4 (the extension)
# of the data file name; the remainder tags the figure file.
plt.savefig('./Figures/' + fname[20:-4] + '_' + hbt_name + '_cmp.png')
# plot skip pattern
# Count skipped events per weekday. Reindexing over all seven day codes
#   * yields the rows in ascending day order without DataFrame.sort(), which
#     newer pandas releases no longer provide, and
#   * keeps a zero-height bar for any day without skips, so the fixed
#     Mon..Sun tick labels set below always sit under the right bar.
# NOTE(review): assumes Day_of_Week is coded 0 (Mon) .. 6 (Sun) -- confirm.
skp_day_counts = hbt_recd_skp.value_counts().reindex(range(7), fill_value=0)
SKP_plt = pd.DataFrame(skp_day_counts).reset_index()
SKP_plt.columns = ['Day_of_Week', 'num']
print('Skipped habit record:')
print(SKP_plt)
SKP_plt.plot(x='Day_of_Week', y='num', kind='bar', width=0.8, figsize=(12.0, 8.0),
             fontsize=22, legend=False, xlim=[-0.5, 6.5])
plt.ylabel('Number of skipped times')
plt.xlabel('')
plt.title('Habit: "' + hbt_name + '" -- skip pattern')
# bar positions 0..6 are relabelled with weekday names
locs, labels = plt.xticks(np.arange(0, 7), ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'))
plt.setp(labels, rotation=45)
# fname[20:-4] drops the first 20 characters and the last 4 (the extension)
# of the data file name; the remainder tags the figure file.
plt.savefig('./Figures/' + fname[20:-4] + '_' + hbt_name + '_skp.png')
# for complete pattern
# Chi-square goodness-of-fit test of the per-weekday counts; with no expected
# frequencies given, stats.chisquare tests against equal counts in every category.
# Weekdays without any event must enter the test as explicit zeros:
# value_counts() alone omits them, which would drop categories from the test
# and distort the statistic and p-value.
# NOTE(review): assumes Day_of_Week is coded 0 (Mon) .. 6 (Sun) -- confirm.
ct = hbt_recd_cmp.value_counts().reindex(range(7), fill_value=0).values
chi2, p = stats.chisquare(ct)
print(p)
if p < 0.05:
    print('the completed habit pattern is significant (p = %.3f)' % (p))
else:
    print('the completed habit pattern is NOT significant (p = %.3f)' % (p))
# for skip pattern
# Chi-square goodness-of-fit test of the per-weekday skip counts; with no expected
# frequencies given, stats.chisquare tests against equal counts in every category.
# Weekdays without any skip must enter the test as explicit zeros:
# value_counts() alone omits them, which would drop categories from the test
# and distort the statistic and p-value.
# NOTE(review): assumes Day_of_Week is coded 0 (Mon) .. 6 (Sun) -- confirm.
ct = hbt_recd_skp.value_counts().reindex(range(7), fill_value=0).values
chi2, p = stats.chisquare(ct)
print(p)
if p < 0.05:
    print('the skipped habit pattern is significant (p = %.3f)' % (p))
else:
    print('the skipped habit pattern is NOT significant (p = %.3f)' % (p))