For the random forest model, the best parameters are {'max_features': 'sqrt', 'n_estimators': (depends on the data set)}.
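These parameters were presumably chosen with a cross-validated parameter search; the sketch below shows how such a search could be run with GridSearchCV. The grid values, the scoring choice, and the helper name find_best_rf_params are illustrative assumptions, not the original search.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Hypothetical grid -- the values actually searched are not recorded in this notebook.
param_grid = {
    'n_estimators': [20, 40, 60, 80, 100],
    'max_features': ['sqrt', 'log2', None],
}
def find_best_rf_params(X, y):
    """Run a small 4-fold grid search and return the best parameter dict."""
    search = GridSearchCV(
        RandomForestClassifier(random_state=10, class_weight='balanced'),
        param_grid,
        cv=4,
        scoring='f1_micro',
        n_jobs=4,
    )
    search.fit(X, y)
    return search.best_params_
# e.g. find_best_rf_params(X, y) with the feature matrix and target built below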
import pandas as pd
import numpy as np
# load March to May data (cleaned data)
DF00 = pd.read_csv('./FB_data_ML_with_uid_2015_35.csv')
DF00.drop('Unnamed: 0', axis=1, inplace=True)
print(DF00.shape)
DF00.head()
# This is the same mapping I did in Part01
# uid_dict = dict(zip(set(list(DF00['property_userId'])), range(len(set(list(DF00['property_userId']))))))
Goal_dict = dict(zip(set(list(DF00['Goal'])), range(len(set(list(DF00['Goal']))))))
Gender_dict = dict(zip(set(list(DF00['Gender'])),range(len(set(list(DF00['Gender']))))))
Habit_dict = dict(zip(set(list(DF00['Habit'])),range(len(set(list(DF00['Habit']))))))
print(Goal_dict)
print(Gender_dict)
print(Habit_dict)
# DF00['UID_int'] = DF00['property_userId'].map(uid_dict);
DF00['Goal_int'] = DF00['Goal'].map(Goal_dict)
DF00['Gender_int'] = DF00['Gender'].map(Gender_dict)
DF00['Habit_int'] = DF00['Habit'].map(Habit_dict)
DF00.head()
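Note that the set()-based dictionaries above can assign different integer codes from one Python session to the next, since set ordering of strings is not stable. A reproducible alternative (not used here) is pandas' factorize, which numbers categories in order of first appearance; a minimal check:
# pd.factorize gives the same integer codes on every run (order of first appearance)
goal_codes, goal_labels = pd.factorize(DF00['Goal'])
print(dict(zip(goal_labels, range(len(goal_labels)))))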
For the March to May data, n_estimators = 40.
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer scikit-learn
# If user ID were included as a predictor, the model would overfit
# X = DF00[['UID_int','Gender_int','Goal_int', 'Habit_int', 'Day_of_Week']]
X = DF00[['Gender_int','Goal_int', 'Habit_int', 'Day_of_Week']]
y = DF00['Action']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2015)
classifier = RandomForestClassifier(n_estimators=40, max_features='sqrt', random_state=10,
                                    max_depth=None, min_samples_split=2,  # must be >= 2 in current scikit-learn
                                    class_weight='balanced', n_jobs=4)    # 'balanced' is the renamed 'auto' weighting
clf = classifier.fit(X_train, y_train)
y_pred = clf.predict(X_test)
from sklearn import metrics
# evaluate on the training set
y_pred_train = clf.predict(X_train)
print("training accuracy:", metrics.accuracy_score(y_train, y_pred_train))
print("training precision:", metrics.precision_score(y_train, y_pred_train, average='micro'))
print("training recall:", metrics.recall_score(y_train, y_pred_train, average='micro'))
print("training f1 score:", metrics.f1_score(y_train, y_pred_train, average='micro'))
# calculate the confusion matrix on the training set
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_train, y_pred_train)
print(cm)
print("accuracy:", metrics.accuracy_score(y_test, y_pred))
print("precision:", metrics.precision_score(y_test, y_pred, average='micro'))
print("recall:", metrics.recall_score(y_test, y_pred, average='micro'))
print("f1 score:", metrics.f1_score(y_test, y_pred, average='micro'))
# calculate the confusion matrix on the test set
cm = confusion_matrix(y_test, y_pred)
print(cm)
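Because these scores are micro-averaged, accuracy, precision, recall, and F1 all reduce to the same number. For a per-class breakdown, a classification report (not part of the original notebook) can also be printed:
from sklearn.metrics import classification_report
# per-class precision, recall, and F1 on the held-out test set
print(classification_report(y_test, y_pred))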
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold  # replaces the removed sklearn.cross_validation
import time
start_time = time.time()
X = DF00[['Gender_int','Goal_int', 'Habit_int', 'Day_of_Week']]
y = DF00['Action']
# 4-fold cross-validated ROC: each fold is fit on three quarters of the data and scored on the held-out quarter
cv = StratifiedKFold(n_splits=4)
classifier = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=10,
                                    max_depth=None, min_samples_split=2, class_weight='balanced')
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
plt.figure(figsize=(10, 6))
for i, (train, test) in enumerate(cv.split(X, y)):
    # fit on the training fold and score the held-out fold
    probas = classifier.fit(X.iloc[train], y.iloc[train]).predict_proba(X.iloc[test])
    # compute the ROC curve and the area under the curve for this fold
    fpr, tpr, thresholds = roc_curve(y.iloc[test], probas[:, 1])
    mean_tpr += np.interp(mean_fpr, fpr, tpr)  # np.interp replaces the deprecated scipy.interp
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
mean_tpr /= cv.get_n_splits(X, y)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.rcParams.update({'font.size': 10})
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (4-fold cross-validation)')
plt.legend(loc="lower right")
print("--- %s seconds ---" % (time.time() - start_time))