import pandas as pd
from datetime import datetime
# Load the raw June 2015 export.
DF00 = pd.read_csv('./data/2015-06-new.csv', low_memory=False)

# Load the names of the useful columns (stored under the 'col_name' header of
# the helper file) and use them to subset DF00.
namelist = pd.read_csv('./column_names_for_ML.txt')
useful_column_names = namelist['col_name']

# Take an explicit copy of the column subset. Without it, DF01 is a selection
# derived from DF00, and the column assignment below triggers pandas'
# SettingWithCopyWarning (the original code noted that warning).
DF01 = DF00[useful_column_names].copy()
print(DF01.shape)
DF01.head()  # only displays something in an interactive session

# Per the original author's note, some rows are entirely NaN.
# First, make all gender labels lowercase.
DF01['property_gender'] = DF01['property_gender'].str.lower()

# Then replace every NaN with the sentinel string 'InputNAN' for later
# filtering (this choice is debatable).
DF02 = DF01.fillna(value='InputNAN')
# Split DF02 into two frames: DF02_skp keeps only the "skipped habit" columns
# (the "completed" pair is dropped) and DF02_cmp keeps only the "completed
# habit" columns (the "skipped" pair is dropped).
_completed_cols = ['property_lastHabitCompleted', 'property_lastHabitCompletedDate']
_skipped_cols = ['property_lastHabitSkipped', 'property_lastHabitSkippedDate']
DF02_skp = DF02.drop(_completed_cols, axis=1)
DF02_cmp = DF02.drop(_skipped_cols, axis=1)
for _frame in (DF02, DF02_cmp, DF02_skp):
    print(_frame.shape)

# Remove duplicated rows from each frame in place, reporting the new shape
# after each pass (completed first, then skipped).
for _frame in (DF02_cmp, DF02_skp):
    _frame.drop_duplicates(inplace=True)
    print(_frame.shape)
def _blank_out_bad_date(raw_date):
    """Return raw_date unchanged when it starts with '2', else 'InputNAN'."""
    if raw_date[0] == '2':
        return raw_date
    return 'InputNAN'


# Replace dates in an unexpected format with the 'InputNAN' sentinel.
# REASON (original author): some dates look like "May ***" and were judged not
# worth keeping; fortunately they only occur in data from before June.
DF02_cmp['property_lastHabitCompletedDate'] = (
    DF02_cmp['property_lastHabitCompletedDate'].apply(_blank_out_bad_date))
DF02_skp['property_lastHabitSkippedDate'] = (
    DF02_skp['property_lastHabitSkippedDate'].apply(_blank_out_bad_date))
print(DF02_cmp.shape)
print(DF02_skp.shape)

# Drop the rows whose habit name or date is the 'InputNAN' sentinel, printing
# the shape after each individual filter.
for _col in ('property_lastHabitCompleted', 'property_lastHabitCompletedDate'):
    DF02_cmp = DF02_cmp[DF02_cmp[_col] != 'InputNAN']
    print(DF02_cmp.shape)
for _col in ('property_lastHabitSkipped', 'property_lastHabitSkippedDate'):
    DF02_skp = DF02_skp[DF02_skp[_col] != 'InputNAN']
    print(DF02_skp.shape)
# Add a day-of-week column; 0-6 denotes Mon-Sun.
# DF02_skp and DF02_cmp are the result of boolean-mask filtering, so take an
# explicit copy before adding a column: this avoids pandas'
# SettingWithCopyWarning and guarantees the new column lands on these frames.
# The weekday comes from the vectorised `.dt` accessor rather than a per-row
# Python lambda.
_EVENT_DATE_FORMAT = '%Y-%m-%dT%H:%M:%S'

DF02_skp = DF02_skp.copy()
DF02_skp['Day_of_Week'] = pd.to_datetime(
    DF02_skp['property_lastHabitSkippedDate'],
    format=_EVENT_DATE_FORMAT).dt.weekday

DF02_cmp = DF02_cmp.copy()
DF02_cmp['Day_of_Week'] = pd.to_datetime(
    DF02_cmp['property_lastHabitCompletedDate'],
    format=_EVENT_DATE_FORMAT).dt.weekday
# Habit names treated as "premade"; rows with any other habit name are
# excluded by the selection below.
premade_hbt = [
    'Yoga',
    'Eat a Great Breakfast',
    'Write in my Journal',
    'Eat More Fruit & Vegetables',
    'Groom Myself',
    'Be Grateful',
    'Disconnect & Create',
    'Power Nap',
    'Sit & Think!',
    'Read',
    'Darker, Quieter, Cooler',
    'Write my To Do',
    'Learn & Study',
    'Floss',
    'Meaningful & Focused Work',
    'Meditate',
    'What are my most important 3 tasks?',
    'Adjust & Review Plans',
    'Schedule in Time Slots',
    'Walk',
    'Weigh myself',
    'Block Distractions',
    'Shower',
    'Celebrate!',
    'Take Medicine',
    'I feel Great Today!',
    'Clean & Tidy up',
    'Eat Whole Grain',
    'Drink Tea',
    'Call Mother & Father',
    'Morning Pages',
    'Eat Fish and Seafood',
    'Take Vitamins',
    'Drink Water',
    'Exercise',
    'Stretch',
    'Get Inspired',
    'Work on a secret project',
    'Reach to Friends',
    'Study',
]

# Keep only the rows whose habit name is one of the premade habits.
_cmp_is_premade = DF02_cmp['property_lastHabitCompleted'].isin(premade_hbt)
_skp_is_premade = DF02_skp['property_lastHabitSkipped'].isin(premade_hbt)
DF02_cmp_premd = DF02_cmp[_cmp_is_premade]
DF02_skp_premd = DF02_skp[_skp_is_premade]
print(DF02_cmp_premd.shape)
print(DF02_skp_premd.shape)
def _to_event_frame(frame, habit_col, date_col, action):
    """Return `frame` reshaped into the common event layout.

    The date column is dropped, the habit / goal / gender columns are renamed
    to 'Habit' / 'Goal' / 'Gender', and a constant 'Action' column is added.
    """
    renamed = frame.drop([date_col], axis=1).rename(
        columns={habit_col: 'Habit',
                 'property_userGoal': 'Goal',
                 'property_gender': 'Gender'})
    renamed['Action'] = action
    return renamed


# The date columns are removed here (original author's note: they may need to
# be added back later). The new "Action" column encodes the event type:
# 0 = complete, 1 = skip.
DF02_cmp_premd = _to_event_frame(
    DF02_cmp_premd,
    'property_lastHabitCompleted',
    'property_lastHabitCompletedDate',
    0)
DF02_cmp_premd.head()

DF02_skp_premd = _to_event_frame(
    DF02_skp_premd,
    'property_lastHabitSkipped',
    'property_lastHabitSkippedDate',
    1)
DF02_skp_premd.head()

# Report both shapes and the combined row count.
print(DF02_cmp_premd.shape)
print(DF02_skp_premd.shape)
print(len(DF02_cmp_premd) + len(DF02_skp_premd))
# Combine the complete (Action = 0) and skip (Action = 1) events into one frame.
DF_cmp_skp = pd.concat([DF02_cmp_premd, DF02_skp_premd])
print(DF_cmp_skp.shape)

# Save the cleaned data; remember to change the file name for each new data set.
# An existing file is never overwritten: the script only reports that it is
# already there. The if/else bodies are indented here (they were not in the
# original, which made the script unparseable).
import os.path
fname = 'FB_data_ML_with_uid_201506.csv'
if os.path.isfile(fname):
    print('file "' + fname + '" already exists')
else:
    DF_cmp_skp.to_csv(fname)
    print('file "' + fname + '" saved')
# Merge two previously saved data sets into a single file.
DF1 = pd.read_csv('./ML/FB_data_ML_with_uid_2015_37.csv')
DF2 = pd.read_csv('./ML/FB_data_ML_with_uid_201508.csv')

# Drop the 'Unnamed: 0' column from both inputs. `axis` is passed by keyword:
# the positional form `drop(label, 1)` is rejected by recent pandas releases,
# while the keyword form works on old and new versions alike.
DF1.drop('Unnamed: 0', axis=1, inplace=True)
DF2.drop('Unnamed: 0', axis=1, inplace=True)
print('%s %s' % (DF1.shape, DF2.shape))

DF0 = pd.concat([DF1, DF2])
print(DF0.shape)

# Save the merged data unless the target file already exists (an existing file
# is never overwritten). The if/else bodies are indented here (they were not
# in the original, which made the script unparseable).
import os.path
fname = 'FB_data_ML_with_uid_2015_38.csv'
if os.path.isfile(fname):
    print('file "' + fname + '" already exists')
else:
    DF0.to_csv(fname)
    print('file "' + fname + '" saved')