This file shows how I read and reformat the data for future use. I'll refer to the result as the "cleaned data" in the other instructions.

In [1]:
import pandas as pd
from datetime import datetime
In [2]:
# Read the raw monthly event export. low_memory=False makes pandas parse the
# whole file in one pass so dtype inference is consistent across chunks.
# NOTE(review): its use suggests the raw CSV has mixed-type columns — confirm.
DF00 = pd.read_csv('./data/2015-06-new.csv',low_memory=False)
In [3]:
# load useful column names, to subset df00
namelist = pd.read_csv('./column_names_for_ML.txt')
useful_column_names = namelist['col_name']
DF01 = DF00[useful_column_names]
print DF01.shape
DF01.head()
# as you can see, there are rows with all NaN
(652133, 7)
Out[3]:
property_userId property_userGoal property_gender property_lastHabitSkipped property_lastHabitSkippedDate property_lastHabitCompleted property_lastHabitCompletedDate
0 b5e7d864-2504-4fe7-b51a-a85876d290f6 Energy FEMALE NaN NaN Eat a Great Breakfast 2015-05-31T19:42:00
1 d2f9faea-72de-481b-9abc-3f76df1f546f Energy FEMALE Drink Tea 2015-05-20T23:00:27 Call Mother & Father 2015-05-22T00:00:14
2 879758c7-9b62-4e07-aea0-71a6cad20e98 NaN NaN NaN NaN NaN NaN
3 879758c7-9b62-4e07-aea0-71a6cad20e98 NaN NaN NaN NaN NaN NaN
4 6ca7dff5-8105-4c94-aed5-86cb967b5474 Energy FEMALE NaN NaN Eat a Great Breakfast 2015-05-18T00:13:03
In [4]:
# first, make all gender lowercase
DF01['property_gender'] = DF01['property_gender'].str.lower() # this line causes the warning msg below

# then re-assign NaN value as 'InputNAN' for futrue use, argumentable
DF02 = DF01.fillna(value='InputNAN')

# divide DF02 into two dfs, one for skipped hbt, one for completed hbt
DF02_skp = DF02.drop(['property_lastHabitCompleted','property_lastHabitCompletedDate'],axis = 1)
DF02_cmp = DF02.drop(['property_lastHabitSkipped','property_lastHabitSkippedDate'],axis = 1)
print DF02.shape
print DF02_cmp.shape
print DF02_skp.shape

# drop duplicated rows in each DF
DF02_cmp.drop_duplicates(inplace = True)
print DF02_cmp.shape
DF02_skp.drop_duplicates(inplace = True)
print DF02_skp.shape
(652133, 7)
(652133, 5)
(652133, 5)
(194582, 5)
(28148, 5)
/Users/liang/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
In [5]:
# re-assign dates with incorrect format as 'InputNAN'
# REASON: some of the dates were in format like May ***, and I decided to remove them.
# Fotunately, there are only in data before June
DF02_cmp['property_lastHabitCompletedDate'] = DF02_cmp['property_lastHabitCompletedDate'].apply(lambda x: 'InputNAN' if x[0] != '2' else x)
DF02_skp['property_lastHabitSkippedDate'] = DF02_skp['property_lastHabitSkippedDate'].apply(lambda x: 'InputNAN' if x[0] != '2' else x)
print DF02_cmp.shape
print DF02_skp.shape

# drop rows with habit name and dates = 'InputNAN'
DF02_cmp = DF02_cmp[DF02_cmp['property_lastHabitCompleted'] != 'InputNAN']
print DF02_cmp.shape
DF02_cmp = DF02_cmp[DF02_cmp['property_lastHabitCompletedDate'] != 'InputNAN']
print DF02_cmp.shape
DF02_skp = DF02_skp[DF02_skp['property_lastHabitSkipped'] != 'InputNAN']
print DF02_skp.shape
DF02_skp = DF02_skp[DF02_skp['property_lastHabitSkippedDate'] != 'InputNAN']
print DF02_skp.shape
(194582, 5)
(28148, 5)
(181991, 5)
(181991, 5)
(14007, 5)
(14007, 5)
In [6]:
# Add a column for day of week, 0-6 denotes Mon-Sun.
# The vectorized .dt.weekday accessor yields the same integers as the
# original per-element apply(lambda x: x.weekday()), but in one pass.
DF02_skp['Day_of_Week'] = pd.to_datetime(
    DF02_skp['property_lastHabitSkippedDate'], format='%Y-%m-%dT%H:%M:%S').dt.weekday

DF02_cmp['Day_of_Week'] = pd.to_datetime(
    DF02_cmp['property_lastHabitCompletedDate'], format='%Y-%m-%dT%H:%M:%S').dt.weekday
In [9]:
# Catalogue of the app's premade (built-in) habit names; events referring to
# any other (user-created) habit are filtered out in the next cell.
premade_hbt = [
    'Yoga', 'Eat a Great Breakfast', 'Write in my Journal',
    'Eat More Fruit & Vegetables', 'Groom Myself', 'Be Grateful',
    'Disconnect & Create', 'Power Nap', 'Sit & Think!', 'Read',
    'Darker, Quieter, Cooler', 'Write my To Do', 'Learn & Study',
    'Floss', 'Meaningful & Focused Work', 'Meditate',
    'What are my most important 3 tasks?', 'Adjust & Review Plans',
    'Schedule in Time Slots', 'Walk', 'Weigh myself', 'Block Distractions',
    'Shower', 'Celebrate!', 'Take Medicine', 'I feel Great Today!',
    'Clean & Tidy up', 'Eat Whole Grain', 'Drink Tea', 'Call Mother & Father',
    'Morning Pages', 'Eat Fish and Seafood', 'Take Vitamins', 'Drink Water',
    'Exercise', 'Stretch', 'Get Inspired', 'Work on a secret project',
    'Reach to Friends', 'Study',
]
In [10]:
# select all then inputs with premade habits
DF02_cmp_premd = DF02_cmp[DF02_cmp['property_lastHabitCompleted'].isin(premade_hbt)]
DF02_skp_premd = DF02_skp[DF02_skp['property_lastHabitSkipped'].isin(premade_hbt)]
print DF02_cmp_premd.shape
print DF02_skp_premd.shape
(155880, 6)
(11942, 6)
In [11]:
# The completion dates are removed here; they may need to be added back.
# A new "Action" column is also added: 0 = complete.
DF02_cmp_premd = (DF02_cmp_premd
                  .drop(['property_lastHabitCompletedDate'], axis=1)
                  .rename(columns={'property_lastHabitCompleted': 'Habit',
                                   'property_userGoal': 'Goal',
                                   'property_gender': 'Gender'}))
DF02_cmp_premd['Action'] = 0
DF02_cmp_premd.head()
Out[11]:
property_userId Goal Gender Habit Day_of_Week Action
0 b5e7d864-2504-4fe7-b51a-a85876d290f6 Energy female Eat a Great Breakfast 6 0
1 d2f9faea-72de-481b-9abc-3f76df1f546f Energy female Call Mother & Father 4 0
4 6ca7dff5-8105-4c94-aed5-86cb967b5474 Energy female Eat a Great Breakfast 0 0
5 919808ee-5dbe-48fb-83f9-3d45498bdcdc Weight female Drink Water 3 0
8 1004d1f9-8b21-408b-8781-cd2bb7dca617 Energy female Take Medicine 6 0
In [12]:
# I removed the dates here, may need to add them back
# I also added a new column called "Action", 1 = skip
DF02_skp_premd = DF02_skp_premd.drop(['property_lastHabitSkippedDate'], axis = 1)
DF02_skp_premd.rename(columns={'property_lastHabitSkipped':'Habit',
                              'property_userGoal':'Goal',
                              'property_gender':'Gender'},inplace=True)
DF02_skp_premd['Action'] = 1
DF02_skp_premd.head()
Out[12]:
property_userId Goal Gender Habit Day_of_Week Action
1 d2f9faea-72de-481b-9abc-3f76df1f546f Energy female Drink Tea 2 1
5 919808ee-5dbe-48fb-83f9-3d45498bdcdc Weight female Morning Pages 1 1
11 5596e827-5ff0-4376-99e8-6631f2f57b8e Energy InputNAN Clean & Tidy up 6 1
14 cf79683f-7b5c-442f-9c13-3a519f036fa9 Sleep female Floss 5 1
20 ef25e6b8-3015-422b-ac65-8b1daa8272ef Energy female Drink Water 1 1
In [13]:
# Sanity check: row counts of both premade-habit frames and their sum,
# which should match the length of the combined frame built in the next cell.
print DF02_cmp_premd.shape
print DF02_skp_premd.shape
print len(DF02_cmp_premd) + len(DF02_skp_premd)
(155880, 6)
(11942, 6)
167822
In [14]:
# Combine the skip and complete events.
# NOTE(review): pd.concat keeps each frame's original row labels, so the
# combined frame has non-unique indexes; to_csv below writes them out, and
# the merge script at the end drops the resulting 'Unnamed: 0' column.
DF_cmp_skp = pd.concat([DF02_cmp_premd, DF02_skp_premd])
print DF_cmp_skp.shape
(167822, 6)
In [15]:
# save the cleaned data, remember to change the name.
import os.path
fname = 'FB_data_ML_with_uid_201506.csv'
if os.path.isfile(fname):
    print 'file "' + fname + '" already exists'
else:
    DF_cmp_skp.to_csv(fname)
    print 'file "' + fname + '" saved'
file "FB_data_ML_with_uid_201506.csv" saved

The following is a short script to combine two datasets; for example, combining 201503 and 201504 gives 2015_34, and combining 2015_34 and 201505 gives 2015_35.

In [ ]:
DF1 = pd.read_csv('./ML/FB_data_ML_with_uid_2015_37.csv')
DF2 = pd.read_csv('./ML/FB_data_ML_with_uid_201508.csv')
DF1.drop('Unnamed: 0',1,inplace=True)
DF2.drop('Unnamed: 0',1,inplace=True)
print DF1.shape, DF2.shape

DF0 = pd.concat([DF1, DF2])
print DF0.shape

import os.path
fname = 'FB_data_ML_with_uid_2015_38.csv'
if os.path.isfile(fname):
    print 'file "' + fname + '" already exists'
else:
    DF0.to_csv(fname)
    print 'file "' + fname + '" saved'