The main purpose of this notebook is to use the classifier built in RF_Part01 to make predictions and plot ROC curves

For the random forest model, the best parameters are: {'max_features': 'sqrt', 'n_estimators': (depends on data sets)}
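These came from the grid search in Part01. As a reminder, here is a minimal sketch of that kind of search; the grid and scoring below are illustrative, not necessarily what Part01 actually used:

from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in newer versions
from sklearn.ensemble import RandomForestClassifier

# illustrative grid; Part01's actual search may have differed
param_grid = {'max_features': ['sqrt', 'log2'],
              'n_estimators': [10, 20, 40, 80]}
grid = GridSearchCV(RandomForestClassifier(random_state=10),
                    param_grid, cv=3, scoring='roc_auc')
# grid.fit(X_train, y_train)   # X_train / y_train as built later in this notebook
# print(grid.best_params_)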

In [1]:
import pandas as pd
import numpy as np

As an example, I use March-to-May data to predict June, so I first need to train a model on the March-to-May data.

In [2]:
# load March to May data (cleaned data)
DF00 = pd.read_csv('./FB_data_ML_with_uid_2015_35.csv')
DF00.drop('Unnamed: 0', axis=1, inplace=True)
print DF00.shape
DF00.head()
(229378, 6)
Out[2]:
property_userId Goal Gender Habit Day_of_Week Action
0 fabbc998-2a02-46b8-8442-45445096913b Energy male Drink Water 0 0
1 fabbc998-2a02-46b8-8442-45445096913b Energy male Drink Water 0 0
2 fabbc998-2a02-46b8-8442-45445096913b Energy male Meditate 0 0
3 fabbc998-2a02-46b8-8442-45445096913b Energy male Drink Water 0 0
4 fabbc998-2a02-46b8-8442-45445096913b Energy male Clean & Tidy up 0 0
In [3]:
# This is the same mapping I did in Part01
# uid_dict =  dict(zip(set(list(DF00['property_userId'])), range(len(set(list(DF00['property_userId']))))))
Goal_dict = dict(zip(set(list(DF00['Goal'])), range(len(set(list(DF00['Goal']))))))
Gender_dict = dict(zip(set(list(DF00['Gender'])),range(len(set(list(DF00['Gender']))))))
Habit_dict = dict(zip(set(list(DF00['Habit'])),range(len(set(list(DF00['Habit']))))))
print Goal_dict
print Gender_dict
print Habit_dict

# DF00['UID_int'] = DF00['property_userId'].map(uid_dict)
DF00['Goal_int'] = DF00['Goal'].map(Goal_dict)
DF00['Gender_int'] = DF00['Gender'].map(Gender_dict)
DF00['Habit_int'] = DF00['Habit'].map(Habit_dict)

DF00.head()
{'old_user': 0, 'Weight': 1, 'Energy': 2, 'InputNAN': 3, 'Focus': 4, 'Sleep': 5}
{'InputNAN': 0, 'male': 1, 'other': 2, 'female': 3}
{'Write in my Journal': 0, 'Yoga': 1, 'Disconnect & Create': 2, 'Stretch': 3, 'Reach to Friends': 4, 'Morning Pages': 5, 'Floss': 6, 'Weigh myself': 7, 'Meditate': 8, 'Drink Water': 9, 'Get Inspired': 10, 'Exercise': 11, 'Groom Myself': 12, 'Power Nap': 13, 'Read': 14, 'Take Medicine': 15, 'Clean & Tidy up': 16, 'Eat a Great Breakfast': 17, 'Take Vitamins': 18, 'Eat More Fruit & Vegetables': 19, 'Study': 20, 'I feel Great Today!': 21, 'Celebrate!': 22, 'Shower': 23, 'Darker, Quieter, Cooler': 24, 'Be Grateful': 25, 'Call Mother & Father': 26, 'Walk': 27, 'Work on a secret project': 28, 'Drink Tea': 29}
Out[3]:
property_userId Goal Gender Habit Day_of_Week Action Goal_int Gender_int Habit_int
0 fabbc998-2a02-46b8-8442-45445096913b Energy male Drink Water 0 0 2 1 9
1 fabbc998-2a02-46b8-8442-45445096913b Energy male Drink Water 0 0 2 1 9
2 fabbc998-2a02-46b8-8442-45445096913b Energy male Meditate 0 0 2 1 8
3 fabbc998-2a02-46b8-8442-45445096913b Energy male Drink Water 0 0 2 1 9
4 fabbc998-2a02-46b8-8442-45445096913b Energy male Clean & Tidy up 0 0 2 1 16
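One caveat before training: these dicts are built from set(), whose iteration order is not guaranteed, so a rerun (or encoding the June data in a separate session) can assign different integers to the same category. A safer pattern is to persist the mappings once and reapply them to any later month; a minimal sketch (the pickle filename is just an illustration):

import pickle

# save the encodings built from the March-May data ...
mappings = {'Goal': Goal_dict, 'Gender': Gender_dict, 'Habit': Habit_dict}
with open('encodings_2015_35.pkl', 'wb') as f:  # hypothetical filename
    pickle.dump(mappings, f)

# ... and reapply them to a later month so the integer codes line up
# DF_june['Goal_int'] = DF_june['Goal'].map(mappings['Goal'])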

This is where I train a prediction model.

For March to May data, n_estimators = 40.

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split

# Including user ID as a predictor makes the model overfit
# X = DF00[['UID_int','Gender_int','Goal_int', 'Habit_int', 'Day_of_Week']]
X = DF00[['Gender_int','Goal_int', 'Habit_int', 'Day_of_Week']]
y = DF00['Action']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2015)

classifier = RandomForestClassifier(n_estimators = 40, max_features = 'sqrt', random_state=10,
                                    max_depth=None, min_samples_split=1, class_weight = 'auto', n_jobs=4)

clf = classifier.fit(X_train, y_train)
y_pred = clf.predict(X_test)

Now, show accuracy, precision, recall, and F1 score for the training and test sets.

For the training set:

In [5]:
from sklearn import metrics
In [6]:
y_pred_train = clf.predict(X_train)
# report positive-class ('Action' == 1) performance
print("accuracy:", metrics.accuracy_score(y_train, y_pred_train))
print("precision:", metrics.precision_score(y_train, y_pred_train, average='binary'))
print("recall:", metrics.recall_score(y_train, y_pred_train, average='binary'))
print("f1 score:", metrics.f1_score(y_train, y_pred_train, average='binary'))
('accuracy:', 0.70597394185496121)
('precision:', 0.12727382607119114)
('recall:', 0.61910828025477704)
('f1 score:', 0.21114193095612072)
In [7]:
# calculate the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_train, y_pred_train)

print(cm)
[[107036  43323]
 [  3887   6318]]
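The precision and recall printed above are positive-class ('Action' == 1) numbers, and they can be read straight off this matrix; a quick sanity check:

TN, FP = cm[0]  # row 0: true label 0
FN, TP = cm[1]  # row 1: true label 1
print('precision: %f' % (float(TP) / (TP + FP)))  # 6318 / 49641 ~= 0.1273
print('recall:    %f' % (float(TP) / (TP + FN)))  # 6318 / 10205 ~= 0.6191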

For the test set:

In [8]:
print("accuracy:", metrics.accuracy_score(y_test, y_pred))
print("precision:", metrics.precision_score(y_test, y_pred, average='micro'))
print("recall:", metrics.recall_score(y_test, y_pred, average='micro'))
print("f1 score:", metrics.f1_score(y_test, y_pred, average='micro'))
('accuracy:', 0.69469875316069407)
('precision:', 0.10841165413533835)
('recall:', 0.53119963159106609)
('f1 score:', 0.18007259103149514)
In [9]:
# calculate the confusion matrix
cm = confusion_matrix(y_test, y_pred)

print(cm)
[[45498 18973]
 [ 2036  2307]]
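The confusion matrices also show how imbalanced the classes are: non-actions outnumber actions by roughly 15 to 1. That imbalance is why the model uses class_weight = 'auto' and why the ROC/AUC below is a more honest summary than raw accuracy. A quick check of the base rate:

print(y.value_counts())
print('positive rate: %f' % y.mean())  # about 0.06, so always predicting 0 already gives ~0.94 accuracy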

Now plot the ROC curves.

In [10]:
from scipy import interp
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import StratifiedKFold
In [11]:
import time
start_time = time.time()

X = DF00[['Gender_int','Goal_int', 'Habit_int', 'Day_of_Week']]
y = DF00['Action']

# stratified 4-fold CV over the full data set; each fold gets its own ROC curve
cv = StratifiedKFold(y, n_folds=4)
classifier = RandomForestClassifier(n_estimators = 100, max_features = 'sqrt', random_state=10,
                                    max_depth=None, min_samples_split=1, class_weight = 'auto')

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []
plt.figure(figsize=(10,6))

for i, (train, test) in enumerate(cv):
    # fit on this fold's training rows and score the held-out rows
    probas = classifier.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    # compute the ROC curve and the area under it for this fold
    fpr, tpr, thresholds = roc_curve(y.iloc[test], probas[:, 1])
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
    
plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.rcParams.update({'font.size': 10})
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")

print("--- %s seconds ---" % (time.time() - start_time))
--- 45.2408509254 seconds ---
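
Finally, the whole point of this notebook is to score June with this model. A sketch of that step, assuming June was cleaned into the same column layout (the filename is hypothetical, following the pattern of the March-May file):

# hypothetical June file, cleaned the same way as the March-May data
DF_june = pd.read_csv('./FB_data_ML_with_uid_2015_6.csv')
DF_june['Goal_int'] = DF_june['Goal'].map(Goal_dict)
DF_june['Gender_int'] = DF_june['Gender'].map(Gender_dict)
DF_june['Habit_int'] = DF_june['Habit'].map(Habit_dict)
# categories never seen in March-May map to NaN; drop those rows before scoring
DF_june = DF_june.dropna(subset=['Goal_int', 'Gender_int', 'Habit_int'])

X_june = DF_june[['Gender_int', 'Goal_int', 'Habit_int', 'Day_of_Week']]
probas_june = clf.predict_proba(X_june)[:, 1]  # P(Action == 1) per row
fpr_j, tpr_j, _ = roc_curve(DF_june['Action'], probas_june)
print('June AUC: %0.3f' % auc(fpr_j, tpr_j))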