This file is to use previous data to predict next month¶

March to May data are used here to predict June activities¶

import pandas as pd
import numpy as np

DF00 = pd.read_csv('./FB_data_ML_with_uid_2015_35.csv')
DF01 = pd.read_csv('./FB_data_ML_with_uid_201506.csv')

DF00.drop('Unnamed: 0',1,inplace=True)
print DF00.shape
DF01.drop('Unnamed: 0',1,inplace=True)
print DF01.shape

(229378, 6)
(167822, 6)

# This the same mapping I did in Part01 and Part02
# uid_dict =  dict(zip(set(list(DF00['property_userId'])), range(len(set(list(DF00['property_userId']))))))
Goal_dict = dict(zip(set(list(DF00['Goal'])), range(len(set(list(DF00['Goal']))))))
Gender_dict = dict(zip(set(list(DF00['Gender'])),range(len(set(list(DF00['Gender']))))))
Habit_dict = dict(zip(set(list(DF00['Habit'])),range(len(set(list(DF00['Habit']))))))
print Goal_dict
print Gender_dict
print Habit_dict

# DF00['UID_int'] = DF00['property_userId'].map(uid_dict);
DF00['Goal_int'] = DF00['Goal'].map(Goal_dict);
DF00['Gender_int'] = DF00['Gender'].map(Gender_dict);
DF00['Habit_int'] = DF00['Habit'].map(Habit_dict);
DF01['Goal_int'] = DF00['Goal'].map(Goal_dict);
DF01['Gender_int'] = DF00['Gender'].map(Gender_dict);
DF01['Habit_int'] = DF00['Habit'].map(Habit_dict);

{'old_user': 0, 'Weight': 1, 'Energy': 2, 'InputNAN': 3, 'Focus': 4, 'Sleep': 5}
{'InputNAN': 0, 'male': 1, 'other': 2, 'female': 3}
{'Write in my Journal': 0, 'Yoga': 1, 'Disconnect & Create': 2, 'Stretch': 3, 'Reach to Friends': 4, 'Morning Pages': 5, 'Floss': 6, 'Weigh myself': 7, 'Meditate': 8, 'Drink Water': 9, 'Get Inspired': 10, 'Exercise': 11, 'Groom Myself': 12, 'Power Nap': 13, 'Read': 14, 'Take Medicine': 15, 'Clean & Tidy up': 16, 'Eat a Great Breakfast': 17, 'Take Vitamins': 18, 'Eat More Fruit & Vegetables': 19, 'Study': 20, 'I feel Great Today!': 21, 'Celebrate!': 22, 'Shower': 23, 'Darker, Quieter, Cooler': 24, 'Be Grateful': 25, 'Call Mother & Father': 26, 'Walk': 27, 'Work on a secret project': 28, 'Drink Tea': 29}

DF00.head()

DF01.head()

Now I am predicting June activities using a model based onMarch to May data.¶

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
# from sklearn import pipeline

# X = DF00[['UID_int','Gender_int','Goal_int', 'Habit_int', 'Day_of_Week']]
X_train = DF00[['Gender_int','Goal_int', 'Habit_int', 'Day_of_Week']]
y_train = DF00['Action']

X_test = DF01[['Gender_int','Goal_int', 'Habit_int', 'Day_of_Week']]
y_test = DF01['Action']

classifier = RandomForestClassifier(n_estimators = 40, max_features = 'sqrt', random_state=10,
                                    max_depth=None, min_samples_split=1, class_weight = 'auto')

clf = classifier.fit(X_train, y_train)
y_pred = clf.predict(X_test)

Now, to show accuracy, precision, recall and F1 score for training set and testing set¶

For the training set (March to May):¶

from sklearn import metrics

y_pred_train = clf.predict(X_train)
print("accuracy:", metrics.accuracy_score(y_train, y_pred_train))
print("precision:", metrics.precision_score(y_train, y_pred_train, average='micro'))
print("recall:", metrics.recall_score(y_train, y_pred_train, average='micro'))
print("f1 score:", metrics.f1_score(y_train, y_pred_train, average='micro'))

('accuracy:', 0.68908090575382119)
('precision:', 0.1218459406890304)
('recall:', 0.62867748144074787)

/Users/liang/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:931: DeprecationWarning: From version 0.18, binary input will not be handled specially when using averaged precision/recall/F-score. Please use average='binary' to report only the positive class performance.
  'positive class performance.', DeprecationWarning)
/Users/liang/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:931: DeprecationWarning: From version 0.18, binary input will not be handled specially when using averaged precision/recall/F-score. Please use average='binary' to report only the positive class performance.
  'positive class performance.', DeprecationWarning)
/Users/liang/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:931: DeprecationWarning: From version 0.18, binary input will not be handled specially when using averaged precision/recall/F-score. Please use average='binary' to report only the positive class performance.
  'positive class performance.', DeprecationWarning)

('f1 score:', 0.20412900345943533)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_train, y_pred_train)

print(cm)

[[148914  65916]
 [  5402   9146]]

For the prediction set (June):¶

print("accuracy:", metrics.accuracy_score(y_test, y_pred))
print("precision:", metrics.precision_score(y_test, y_pred, average='micro'))
print("recall:", metrics.recall_score(y_test, y_pred, average='micro'))
print("f1 score:", metrics.f1_score(y_test, y_pred, average='micro'))

('accuracy:', 0.65935932118554186)
('precision:', 0.072421291481516492)
('recall:', 0.32071679785630547)

/Users/liang/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:931: DeprecationWarning: From version 0.18, binary input will not be handled specially when using averaged precision/recall/F-score. Please use average='binary' to report only the positive class performance.
  'positive class performance.', DeprecationWarning)
/Users/liang/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:931: DeprecationWarning: From version 0.18, binary input will not be handled specially when using averaged precision/recall/F-score. Please use average='binary' to report only the positive class performance.
  'positive class performance.', DeprecationWarning)
/Users/liang/anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.py:931: DeprecationWarning: From version 0.18, binary input will not be handled specially when using averaged precision/recall/F-score. Please use average='binary' to report only the positive class performance.
  'positive class performance.', DeprecationWarning)

('f1 score:', 0.11816064294198404)

# from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

print(cm)

[[106825  49055]
 [  8112   3830]]

	property_userId	Goal	Gender	Habit	Goal_int	Gender_int	Habit_int
0	fabbc998-2a02-46b8-8442-45445096913b	Energy	male	Drink Water	2	1	9
1	fabbc998-2a02-46b8-8442-45445096913b	Energy	male	Drink Water	2	1	9
2	fabbc998-2a02-46b8-8442-45445096913b	Energy	male	Meditate	2	1	8
3	fabbc998-2a02-46b8-8442-45445096913b	Energy	male	Drink Water	2	1	9
4	fabbc998-2a02-46b8-8442-45445096913b	Energy	male	Clean & Tidy up	2	1	16

	property_userId	Goal	Gender	Habit	Day_of_Week	Goal_int	Gender_int	Habit_int
0	b5e7d864-2504-4fe7-b51a-a85876d290f6	Energy	female	Eat a Great Breakfast	6	2	1	9
1	d2f9faea-72de-481b-9abc-3f76df1f546f	Energy	female	Call Mother & Father	4	2	1	9
2	6ca7dff5-8105-4c94-aed5-86cb967b5474	Energy	female	Eat a Great Breakfast	0	2	1	8
3	919808ee-5dbe-48fb-83f9-3d45498bdcdc	Weight	female	Drink Water	3	2	1	9
4	1004d1f9-8b21-408b-8781-cd2bb7dca617	Energy	female	Take Medicine	6	2	1	16