This file shows how I read and reformat the data for future use. I'll refer to the result as the "cleaned data" in the other instructions.

In [1]:
import pandas as pd
from datetime import datetime
In [2]:
# Read the raw monthly event export. low_memory=False makes pandas parse the
# whole file in one pass so dtype inference is consistent across chunks.
# NOTE(review): its use suggests the raw CSV has mixed-type columns — confirm.
DF00 = pd.read_csv('./data/2015-06-new.csv',low_memory=False)
In [3]:
# load useful column names, to subset df00
namelist = pd.read_csv('./column_names_for_ML.txt')
useful_column_names = namelist['col_name']
DF01 = DF00[useful_column_names]
print DF01.shape
DF01.head()
# as you can see, there are rows with all NaN
(652133, 7)
Out[3]:
property_userId property_userGoal property_gender property_lastHabitSkipped property_lastHabitSkippedDate property_lastHabitCompleted property_lastHabitCompletedDate
0 b5e7d864-2504-4fe7-b51a-a85876d290f6 Energy FEMALE NaN NaN Eat a Great Breakfast 2015-05-31T19:42:00
1 d2f9faea-72de-481b-9abc-3f76df1f546f Energy FEMALE Drink Tea 2015-05-20T23:00:27 Call Mother & Father 2015-05-22T00:00:14
2 879758c7-9b62-4e07-aea0-71a6cad20e98 NaN NaN NaN NaN NaN NaN
3 879758c7-9b62-4e07-aea0-71a6cad20e98 NaN NaN NaN NaN NaN NaN
4 6ca7dff5-8105-4c94-aed5-86cb967b5474 Energy FEMALE NaN NaN Eat a Great Breakfast 2015-05-18T00:13:03
In [4]:
# first, make all gender lowercase
DF01['property_gender'] = DF01['property_gender'].str.lower() # this line causes the warning msg below

# then re-assign NaN value as 'InputNAN' for futrue use, argumentable
DF02 = DF01.fillna(value='InputNAN')

# divide DF02 into two dfs, one for skipped hbt, one for completed hbt
DF02_skp = DF02.drop(['property_lastHabitCompleted','property_lastHabitCompletedDate'],axis = 1)
DF02_cmp = DF02.drop(['property_lastHabitSkipped','property_lastHabitSkippedDate'],axis = 1)
print DF02.shape
print DF02_cmp.shape
print DF02_skp.shape

# drop duplicated rows in each DF
DF02_cmp.drop_duplicates(inplace = True)
print DF02_cmp.shape
DF02_skp.drop_duplicates(inplace = True)
print DF02_skp.shape
(652133, 7)
(652133, 5)
(652133, 5)
(194582, 5)
(28148, 5)
/Users/liang/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
In [5]:
# re-assign dates with incorrect format as 'InputNAN'
# REASON: some of the dates were in format like May ***, and I decided to remove them.
# Fotunately, there are only in data before June
DF02_cmp['property_lastHabitCompletedDate'] = DF02_cmp['property_lastHabitCompletedDate'].apply(lambda x: 'InputNAN' if x[0] != '2' else x)
DF02_skp['property_lastHabitSkippedDate'] = DF02_skp['property_lastHabitSkippedDate'].apply(lambda x: 'InputNAN' if x[0] != '2' else x)
print DF02_cmp.shape
print DF02_skp.shape

# drop rows with habit name and dates = 'InputNAN'
DF02_cmp = DF02_cmp[DF02_cmp['property_lastHabitCompleted'] != 'InputNAN']
print DF02_cmp.shape
DF02_cmp = DF02_cmp[DF02_cmp['property_lastHabitCompletedDate'] != 'InputNAN']
print DF02_cmp.shape
DF02_skp = DF02_skp[DF02_skp['property_lastHabitSkipped'] != 'InputNAN']
print DF02_skp.shape
DF02_skp = DF02_skp[DF02_skp['property_lastHabitSkippedDate'] != 'InputNAN']
print DF02_skp.shape
(194582, 5)
(28148, 5)
(181991, 5)
(181991, 5)
(14007, 5)
(14007, 5)
In [6]:
# Add a column for day of week, 0-6 denotes Mon-Sun.
# The vectorized .dt.weekday accessor yields the same integers as the
# original per-element apply(lambda x: x.weekday()), but in one pass.
DF02_skp['Day_of_Week'] = pd.to_datetime(
    DF02_skp['property_lastHabitSkippedDate'], format='%Y-%m-%dT%H:%M:%S').dt.weekday

DF02_cmp['Day_of_Week'] = pd.to_datetime(
    DF02_cmp['property_lastHabitCompletedDate'], format='%Y-%m-%dT%H:%M:%S').dt.weekday
In [9]:
# Catalogue of the app's premade (built-in) habit names; events referring to
# any other (user-created) habit are filtered out in the next cell.
premade_hbt = [
    'Yoga', 'Eat a Great Breakfast', 'Write in my Journal',
    'Eat More Fruit & Vegetables', 'Groom Myself', 'Be Grateful',
    'Disconnect & Create', 'Power Nap', 'Sit & Think!', 'Read',
    'Darker, Quieter, Cooler', 'Write my To Do', 'Learn & Study',
    'Floss', 'Meaningful & Focused Work', 'Meditate',
    'What are my most important 3 tasks?', 'Adjust & Review Plans',
    'Schedule in Time Slots', 'Walk', 'Weigh myself', 'Block Distractions',
    'Shower', 'Celebrate!', 'Take Medicine', 'I feel Great Today!',
    'Clean & Tidy up', 'Eat Whole Grain', 'Drink Tea', 'Call Mother & Father',
    'Morning Pages', 'Eat Fish and Seafood', 'Take Vitamins', 'Drink Water',
    'Exercise', 'Stretch', 'Get Inspired', 'Work on a secret project',
    'Reach to Friends', 'Study',
]
In [10]:
# select all then inputs with premade habits
DF02_cmp_premd = DF02_cmp[DF02_cmp['property_lastHabitCompleted'].isin(premade_hbt)]
DF02_skp_premd = DF02_skp[DF02_skp['property_lastHabitSkipped'].isin(premade_hbt)]
print DF02_cmp_premd.shape
print DF02_skp_premd.shape
(155880, 6)
(11942, 6)
In [11]:
# The completion dates are removed here; they may need to be added back.
# A new "Action" column is also added: 0 = complete.
DF02_cmp_premd = (DF02_cmp_premd
                  .drop(['property_lastHabitCompletedDate'], axis=1)
                  .rename(columns={'property_lastHabitCompleted': 'Habit',
                                   'property_userGoal': 'Goal',
                                   'property_gender': 'Gender'}))
DF02_cmp_premd['Action'] = 0
DF02_cmp_premd.head()
Out[11]:
property_userId Goal Gender Habit Day_of_Week Action
0 b5e7d864-2504-4fe7-b51a-a85876d290f6 Energy female Eat a Great Breakfast 6 0
1 d2f9faea-72de-481b-9abc-3f76df1f546f Energy female Call Mother & Father 4 0
4 6ca7dff5-8105-4c94-aed5-86cb967b5474 Energy female Eat a Great Breakfast 0 0
5 919808ee-5dbe-48fb-83f9-3d45498bdcdc Weight female Drink Water 3 0
8 1004d1f9-8b21-408b-8781-cd2bb7dca617 Energy female Take Medicine 6 0
In [12]:
# I removed the dates here, may need to add them back
# I also added a new column called "Action", 1 = skip
DF02_skp_premd = DF02_skp_premd.drop(['property_lastHabitSkippedDate'], axis = 1)
DF02_skp_premd.rename(columns={'property_lastHabitSkipped':'Habit',
                              'property_userGoal':'Goal',
                              'property_gender':'Gender'},inplace=True)
DF02_skp_premd['Action'] = 1
DF02_skp_premd.head()
Out[12]:
property_userId Goal Gender Habit Day_of_Week Action
1 d2f9faea-72de-481b-9abc-3f76df1f546f Energy female Drink Tea 2 1
5 919808ee-5dbe-48fb-83f9-3d45498bdcdc Weight female Morning Pages 1 1
11 5596e827-5ff0-4376-99e8-6631f2f57b8e Energy InputNAN Clean & Tidy up 6 1
14 cf79683f-7b5c-442f-9c13-3a519f036fa9 Sleep female Floss 5 1
20 ef25e6b8-3015-422b-ac65-8b1daa8272ef Energy female Drink Water 1 1
In [13]:
# Sanity check: row counts of both premade-habit frames and their sum,
# which should match the length of the combined frame built in the next cell.
print DF02_cmp_premd.shape
print DF02_skp_premd.shape
print len(DF02_cmp_premd) + len(DF02_skp_premd)
(155880, 6)
(11942, 6)
167822
In [14]:
# Combine the skip and complete events.
# NOTE(review): pd.concat keeps each frame's original row labels, so the
# combined frame has non-unique indexes; to_csv below writes them out, and
# the merge script at the end drops the resulting 'Unnamed: 0' column.
DF_cmp_skp = pd.concat([DF02_cmp_premd, DF02_skp_premd])
print DF_cmp_skp.shape
(167822, 6)
In [15]:
# save the cleaned data, remember to change the name.
import os.path
fname = 'FB_data_ML_with_uid_201506.csv'
if os.path.isfile(fname):
    print 'file "' + fname + '" already exists'
else:
    DF_cmp_skp.to_csv(fname)
    print 'file "' + fname + '" saved'
file "FB_data_ML_with_uid_201506.csv" saved

The following is a short script to combine two datasets; for example, combining 201503 and 201504 gives 2015_34, and combining 2015_34 and 201505 gives 2015_35.

In [ ]:
DF1 = pd.read_csv('./ML/FB_data_ML_with_uid_2015_37.csv')
DF2 = pd.read_csv('./ML/FB_data_ML_with_uid_201508.csv')
DF1.drop('Unnamed: 0',1,inplace=True)
DF2.drop('Unnamed: 0',1,inplace=True)
print DF1.shape, DF2.shape

DF0 = pd.concat([DF1, DF2])
print DF0.shape

import os.path
fname = 'FB_data_ML_with_uid_2015_38.csv'
if os.path.isfile(fname):
    print 'file "' + fname + '" already exists'
else:
    DF0.to_csv(fname)
    print 'file "' + fname + '" saved'