This file is to use previous data to predict next month

March to May data are used here to predict June activities

In [1]:
import pandas as pd
import numpy as np
In [3]:
DF00 = pd.read_csv('./FB_data_ML_with_uid_2015_35.csv')
DF01 = pd.read_csv('./FB_data_ML_with_uid_201506.csv')

DF00.drop('Unnamed: 0',1,inplace=True)
print DF00.shape
DF01.drop('Unnamed: 0',1,inplace=True)
print DF01.shape
(229378, 6)
(167822, 6)
In [6]:
# This the same mapping I did in Part01 and Part02
# uid_dict =  dict(zip(set(list(DF00['property_userId'])), range(len(set(list(DF00['property_userId']))))))
Goal_dict = dict(zip(set(list(DF00['Goal'])), range(len(set(list(DF00['Goal']))))))
Gender_dict = dict(zip(set(list(DF00['Gender'])),range(len(set(list(DF00['Gender']))))))
Habit_dict = dict(zip(set(list(DF00['Habit'])),range(len(set(list(DF00['Habit']))))))
print Goal_dict
print Gender_dict
print Habit_dict

# DF00['UID_int'] = DF00['property_userId'].map(uid_dict);
DF00['Goal_int'] = DF00['Goal'].map(Goal_dict);
DF00['Gender_int'] = DF00['Gender'].map(Gender_dict);
DF00['Habit_int'] = DF00['Habit'].map(Habit_dict);
DF01['Goal_int'] = DF00['Goal'].map(Goal_dict);
DF01['Gender_int'] = DF00['Gender'].map(Gender_dict);
DF01['Habit_int'] = DF00['Habit'].map(Habit_dict);
{'old_user': 0, 'Weight': 1, 'Energy': 2, 'InputNAN': 3, 'Focus': 4, 'Sleep': 5}
{'InputNAN': 0, 'male': 1, 'other': 2, 'female': 3}
{'Write in my Journal': 0, 'Yoga': 1, 'Disconnect & Create': 2, 'Stretch': 3, 'Reach to Friends': 4, 'Morning Pages': 5, 'Floss': 6, 'Weigh myself': 7, 'Meditate': 8, 'Drink Water': 9, 'Get Inspired': 10, 'Exercise': 11, 'Groom Myself': 12, 'Power Nap': 13, 'Read': 14, 'Take Medicine': 15, 'Clean & Tidy up': 16, 'Eat a Great Breakfast': 17, 'Take Vitamins': 18, 'Eat More Fruit & Vegetables': 19, 'Study': 20, 'I feel Great Today!': 21, 'Celebrate!': 22, 'Shower': 23, 'Darker, Quieter, Cooler': 24, 'Be Grateful': 25, 'Call Mother & Father': 26, 'Walk': 27, 'Work on a secret project': 28, 'Drink Tea': 29}
In [7]:
DF00.head()
Out[7]:
property_userId Goal Gender Habit Day_of_Week Action Goal_int Gender_int Habit_int
0 fabbc998-2a02-46b8-8442-45445096913b Energy male Drink Water 0 0 2 1 9
1 fabbc998-2a02-46b8-8442-45445096913b Energy male Drink Water 0 0 2 1 9
2 fabbc998-2a02-46b8-8442-45445096913b Energy male Meditate 0 0 2 1 8
3 fabbc998-2a02-46b8-8442-45445096913b Energy male Drink Water 0 0 2 1 9
4 fabbc998-2a02-46b8-8442-45445096913b Energy male Clean & Tidy up 0 0 2 1 16
In [8]:
DF01.head()
Out[8]:
property_userId Goal Gender Habit Day_of_Week Action Goal_int Gender_int Habit_int
0 b5e7d864-2504-4fe7-b51a-a85876d290f6 Energy female Eat a Great Breakfast 6 0 2 1 9
1 d2f9faea-72de-481b-9abc-3f76df1f546f Energy female Call Mother & Father 4 0 2 1 9
2 6ca7dff5-8105-4c94-aed5-86cb967b5474 Energy female Eat a Great Breakfast 0 0 2 1 8
3 919808ee-5dbe-48fb-83f9-3d45498bdcdc Weight female Drink Water 3 0 2 1 9
4 1004d1f9-8b21-408b-8781-cd2bb7dca617 Energy female Take Medicine 6 0 2 1 16

Now I am predicting June activities using a model based onMarch to May data.

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
# from sklearn import pipeline

# X = DF00[['UID_int','Gender_int','Goal_int', 'Habit_int', 'Day_of_Week']]
X_train = DF00[['Gender_int','Goal_int', 'Habit_int', 'Day_of_Week']]
y_train = DF00['Action']

X_test = DF01[['Gender_int','Goal_int', 'Habit_int', 'Day_of_Week']]
y_test = DF01['Action']

classifier = RandomForestClassifier(n_estimators = 40, max_features = 'sqrt', random_state=10,
                                    max_depth=None, min_samples_split=1, class_weight = 'auto')

clf = classifier.fit(X_train, y_train)
y_pred = clf.predict(X_test)

Now, to show accuracy, precision, recall and F1 score for training set and testing set

For the training set (March to May):

In [11]:
from sklearn import metrics
In [12]:
y_pred_train = clf.predict(X_train)
print("accuracy:", metrics.accuracy_score(y_train, y_pred_train))
print("precision:", metrics.precision_score(y_train, y_pred_train, average='micro'))
print("recall:", metrics.recall_score(y_train, y_pred_train, average='micro'))
print("f1 score:", metrics.f1_score(y_train, y_pred_train, average='micro'))
('accuracy:', 0.68908090575382119)
('precision:', 0.1218459406890304)
('recall:', 0.62867748144074787)
/Users/liang/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:931: DeprecationWarning: From version 0.18, binary input will not be handled specially when using averaged precision/recall/F-score. Please use average='binary' to report only the positive class performance.
  'positive class performance.', DeprecationWarning)
/Users/liang/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:931: DeprecationWarning: From version 0.18, binary input will not be handled specially when using averaged precision/recall/F-score. Please use average='binary' to report only the positive class performance.
  'positive class performance.', DeprecationWarning)
/Users/liang/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:931: DeprecationWarning: From version 0.18, binary input will not be handled specially when using averaged precision/recall/F-score. Please use average='binary' to report only the positive class performance.
  'positive class performance.', DeprecationWarning)
('f1 score:', 0.20412900345943533)
In [13]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_train, y_pred_train)

print(cm)
[[148914  65916]
 [  5402   9146]]

For the prediction set (June):

In [15]:
print("accuracy:", metrics.accuracy_score(y_test, y_pred))
print("precision:", metrics.precision_score(y_test, y_pred, average='micro'))
print("recall:", metrics.recall_score(y_test, y_pred, average='micro'))
print("f1 score:", metrics.f1_score(y_test, y_pred, average='micro'))
('accuracy:', 0.65935932118554186)
('precision:', 0.072421291481516492)
('recall:', 0.32071679785630547)
/Users/liang/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:931: DeprecationWarning: From version 0.18, binary input will not be handled specially when using averaged precision/recall/F-score. Please use average='binary' to report only the positive class performance.
  'positive class performance.', DeprecationWarning)
/Users/liang/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:931: DeprecationWarning: From version 0.18, binary input will not be handled specially when using averaged precision/recall/F-score. Please use average='binary' to report only the positive class performance.
  'positive class performance.', DeprecationWarning)
/Users/liang/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:931: DeprecationWarning: From version 0.18, binary input will not be handled specially when using averaged precision/recall/F-score. Please use average='binary' to report only the positive class performance.
  'positive class performance.', DeprecationWarning)
('f1 score:', 0.11816064294198404)
In [16]:
# from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

print(cm)
[[106825  49055]
 [  8112   3830]]