This file shows how to get the habit-complete and habit-skip patterns for any habit.

What you need: cleaned datasets.

import numpy as np
import pandas as pd
from scipy import stats
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
matplotlib.style.use('ggplot')

# Load the cleaned data; it can be a single month or a combined dataset.
fname = 'FB_data_ML_with_uid_201508.csv'
DF00 = pd.read_csv('./' + fname)
# Dropping 'Unnamed: 0' is purely cosmetic (the analysis never reads it).
# Drop it only when it is present, so a dataset saved without that column
# still loads, and pass `axis` by keyword because newer pandas releases
# reject it as a positional argument.
if 'Unnamed: 0' in DF00.columns:
    DF00.drop('Unnamed: 0', axis=1, inplace=True)
print(DF00.shape)

(272199, 6)

# Rank habits by how many times each one was skipped.
# Rows with Action == 1 are the skipped events; value_counts() puts the
# most frequent habit first, so row 0 is the most-skipped habit.
hbt_skp_count = DF00.loc[DF00['Action'] == 1, 'Habit'].value_counts().reset_index()
hbt_skp_count.columns = ['Habit', 'num_skp']
hbt_skp_count.head(10)  # adjust the argument to view more or fewer rows

# Choose the habit to analyse by its row position in hbt_skp_count
# (the number shown to the left of the habit name; 0 is the first row).
hbt_name = hbt_skp_count.iloc[0, 0]
print('looking at habit: "' + hbt_name + '"')
# Every record in the dataset that belongs to the chosen habit.
hbt_recd = DF00.loc[DF00['Habit'] == hbt_name]

looking at habit: "Exercise"

# Day-of-week value of every completed event (Action == 0) for this habit.
hbt_recd_cmp = hbt_recd.loc[hbt_recd['Action'] == 0, 'Day_of_Week']
# Day-of-week value of every skipped event (Action == 1) for this habit.
hbt_recd_skp = hbt_recd.loc[hbt_recd['Action'] == 1, 'Day_of_Week']

# Plot the complete pattern: number of completed events per day of week.
#
# value_counts() leaves out any day that has no events, and the bar plot
# draws bars at consecutive positions, so a missing day would shift the
# fixed Mon..Sun tick labels onto the wrong bars.  Reindexing onto 0..6
# keeps such days as zero-height bars and also returns the rows already in
# day order, so no separate sort step (DataFrame.sort no longer exists in
# current pandas) is needed.
cmp_counts = hbt_recd_cmp.value_counts().reindex(range(7), fill_value=0)
CMP_plt = pd.DataFrame({'Day_of_Week': cmp_counts.index, 'num': cmp_counts.values},
                       columns=['Day_of_Week', 'num'])
print('Completed habit record:')
print(CMP_plt)
CMP_plt.plot(x = 'Day_of_Week', y = 'num', kind = 'bar', width=0.8, figsize = (12.0, 8.0),
             fontsize = 22, legend=False, xlim = [-0.5, 6.5])
plt.ylabel('Number of events')
plt.xlabel('')
plt.title('Habit: "'+ hbt_name + '" -- complete pattern')
# NOTE(review): the labels assume Day_of_Week 0 is Monday ... 6 is Sunday -- confirm
# against the data-cleaning step.
locs, labels = plt.xticks(np.arange(0,7), ('Mon','Tue','Wed','Thu','Fri','Sat','Sun'))
plt.setp(labels, rotation=45)
# fname[20:-4] is the part of the file name between the 20-character prefix
# and the '.csv' extension ('201508' for the file name used above).
plt.savefig('./Figures/' + fname[20:-4] +'_' + hbt_name + '_cmp.png')

Completed habit record:
   Day_of_Week   num
0            0  3401
1            1  2853
3            2  2727
2            3  2772
4            4  2591
6            5  2520
5            6  2567

# Plot the skip pattern: number of skipped events per day of week.
#
# value_counts() leaves out any day that has no events, and the bar plot
# draws bars at consecutive positions, so a missing day would shift the
# fixed Mon..Sun tick labels onto the wrong bars.  Reindexing onto 0..6
# keeps such days as zero-height bars and also returns the rows already in
# day order, so no separate sort step (DataFrame.sort no longer exists in
# current pandas) is needed.
skp_counts = hbt_recd_skp.value_counts().reindex(range(7), fill_value=0)
SKP_plt = pd.DataFrame({'Day_of_Week': skp_counts.index, 'num': skp_counts.values},
                       columns=['Day_of_Week', 'num'])
print('Skipped habit record:')
print(SKP_plt)
SKP_plt.plot(x = 'Day_of_Week', y = 'num', kind = 'bar', width=0.8, figsize = (12.0, 8.0),
             fontsize = 22, legend=False, xlim = [-0.5, 6.5])
plt.ylabel('Number of skipped times' )
plt.xlabel('')
plt.title('Habit: "'+ hbt_name + '" -- skip pattern')
# NOTE(review): the labels assume Day_of_Week 0 is Monday ... 6 is Sunday -- confirm
# against the data-cleaning step.
locs, labels = plt.xticks(np.arange(0,7), ('Mon','Tue','Wed','Thu','Fri','Sat','Sun'))
plt.setp(labels, rotation=45)
# fname[20:-4] is the part of the file name between the 20-character prefix
# and the '.csv' extension ('201508' for the file name used above).
plt.savefig('./Figures/' + fname[20:-4] +'_' + hbt_name + '_skp.png')

Skipped habit record:
   Day_of_Week  num
0            0  528
4            1  390
1            2  430
3            3  399
2            4  413
5            5  382
6            6  367

To calculate the significance, use the chi-squared test.

# Chi-square goodness-of-fit test of the per-day counts.  Called without
# expected frequencies, stats.chisquare tests against all categories being
# equally likely.
# The counts are reindexed onto the seven days 0..6 first: value_counts()
# omits days with no events, which would otherwise silently reduce the
# number of categories (and the degrees of freedom) of the test.
# NOTE(review): "equally likely" presumes every weekday occurs equally often
# in the data window; a single calendar month does not quite satisfy that.

# for complete pattern
ct = hbt_recd_cmp.value_counts().reindex(range(7), fill_value=0).values
chi2, p = stats.chisquare(ct)
print(p)
if p < 0.05:
    print('the completed habit pattern is significant (p = %.3f)' % p)
else:
    print('the completed habit pattern is NOT significant (p = %.3f)' % p)

# for skip pattern
ct = hbt_recd_skp.value_counts().reindex(range(7), fill_value=0).values
chi2, p = stats.chisquare(ct)
print(p)
if p < 0.05:
    print('the skipped habit pattern is significant (p = %.3f)' % p)
else:
    print('the skipped habit pattern is NOT significant (p = %.3f)' % p)

1.80419351786e-39
the completed habit pattern is significant (p = 0.000)
2.24927084379e-07
the skipped habit pattern is significant (p = 0.000)

Below is some combined code (starting from In[4]). You only need to change the "n" value in the first line to get the patterns for the corresponding habit.

# Enter here the habit you want to look at: n is the row position in
# hbt_skp_count (the number shown before the habit name; 0 is the first row).
n = 2
hbt_name = hbt_skp_count.iloc[n, 0]
print('looking at habit: "' + hbt_name + '"')
hbt_recd = DF00[DF00['Habit'] == hbt_name]

# completed habit record (Action == 0), in format of day of week
hbt_recd_cmp = hbt_recd[hbt_recd['Action'] == 0].Day_of_Week
# skipped habit record (Action == 1), in format of day of week
hbt_recd_skp = hbt_recd[hbt_recd['Action'] == 1].Day_of_Week

# Tick labels for Day_of_Week values 0..6.
# NOTE(review): assumes 0 is Monday ... 6 is Sunday -- confirm against the
# data-cleaning step.
DAY_LABELS = ('Mon','Tue','Wed','Thu','Fri','Sat','Sun')


def _count_by_day(day_series):
    """Return a table with one row per day 0..6 and columns Day_of_Week / num.

    value_counts() omits days without events; reindexing onto all seven days
    keeps them as zero rows so that the bar positions match DAY_LABELS and
    the chi-square test always sees seven categories.  The rows come back in
    day order, so no separate sort is required.
    """
    counts = day_series.value_counts().reindex(range(len(DAY_LABELS)), fill_value=0)
    return pd.DataFrame({'Day_of_Week': counts.index, 'num': counts.values},
                        columns=['Day_of_Week', 'num'])


def _plot_day_pattern(day_table, y_label, title, out_path):
    """Draw one bar per day of week from day_table and save the figure to out_path."""
    day_table.plot(x = 'Day_of_Week', y = 'num', kind = 'bar', width=0.8, figsize = (12.0, 8.0),
                   fontsize = 22, legend=False, xlim = [-0.5, 6.5])
    plt.ylabel(y_label)
    plt.xlabel('')
    plt.title(title)
    locs, labels = plt.xticks(np.arange(len(DAY_LABELS)), DAY_LABELS)
    plt.setp(labels, rotation=45)
    plt.savefig(out_path)


def _report_chisquare(day_counts, pattern_name):
    """Run a chi-square test on day_counts, print the verdict, return (chi2, p).

    stats.chisquare is called without expected frequencies, i.e. it tests
    against all days being equally likely.
    NOTE(review): that presumes every weekday occurs equally often in the
    data window; a single calendar month does not quite satisfy that.
    """
    chi2, p = stats.chisquare(day_counts)
    print(p)
    verdict = 'significant' if p < 0.05 else 'NOT significant'
    print('the ' + pattern_name + ' habit pattern is ' + verdict + ' (p = %.3f)' % p)
    return chi2, p


# fname[20:-4] is the part of the file name between the 20-character prefix
# and the '.csv' extension; it tags the saved figures with the dataset name.
fig_prefix = './Figures/' + fname[20:-4] +'_' + hbt_name

# plot complete pattern
CMP_plt = _count_by_day(hbt_recd_cmp)
print('Completed habit record:')
print(CMP_plt)
_plot_day_pattern(CMP_plt, 'Number of events',
                  'Habit: "'+ hbt_name + '" -- complete pattern',
                  fig_prefix + '_cmp.png')

# plot skip pattern
SKP_plt = _count_by_day(hbt_recd_skp)
print('Skipped habit record:')
print(SKP_plt)
_plot_day_pattern(SKP_plt, 'Number of skipped times',
                  'Habit: "'+ hbt_name + '" -- skip pattern',
                  fig_prefix + '_skp.png')

# for complete pattern
ct = CMP_plt['num'].values
chi2, p = _report_chisquare(ct, 'completed')

# for skip pattern
ct = SKP_plt['num'].values
chi2, p = _report_chisquare(ct, 'skipped')

looking at habit: "Eat a Great Breakfast"
Completed habit record:
   Day_of_Week   num
0            0  4847
3            1  3770
1            2  3844
2            3  3781
5            4  3746
6            5  3684
4            6  3756
Skipped habit record:
   Day_of_Week  num
0            0  283
6            1  206
5            2  230
2            3  244
3            4  240
1            5  268
4            6  231
2.62807192653e-53
the completed habit pattern is significant (p = 0.000)
0.0131792662973
the skipped habit pattern is significant (p = 0.013)

	Habit	num_skp
0	Exercise	2909
1	Drink Water	2772
2	Eat a Great Breakfast	1702
3	Meditate	1305
4	Clean & Tidy up	727
5	Stretch	695
6	Yoga	695
7	Shower	682
8	Read	665
9	Floss	529