It turns out that "max_features" should always be "sqrt", as expected for categorical data.
"n_estimators" is changing between data sets:
03 to 04 Data: n_estimators = 70;
03 to 05 Data: 40;
03 to 06 Data: 110;
03 to 07 Data: 80;
03 to 08 Data: 80;
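With tuning done, each data set's final forest can be retrained directly with its selected values. A minimal sketch, assuming the X_train/y_train split built later in this notebook, and plugging in the 03-to-05 values (n_estimators = 40):
from sklearn.ensemble import RandomForestClassifier
# final model for one data set, using the tuned values listed above
final_rf = RandomForestClassifier(n_estimators=40, max_features='sqrt',
                                  random_state=10, class_weight='balanced')
final_rf.fit(X_train, y_train)
print(final_rf.score(X_test, y_test))  # mean accuracy on the held-out split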
import pandas as pd
import numpy as np
# load March to May data (cleaned data)
DF00 = pd.read_csv('./FB_data_ML_with_uid_2015_35.csv')
DF00.drop(columns='Unnamed: 0', inplace=True)  # drop the stray index column saved with the CSV
print(DF00.shape)
DF00.head()
# I am mapping each categorical variable to integer codes.
# The next commented-out line would map userID, but I will not use user ID as a predictor.
# uid_dict = {v: i for i, v in enumerate(DF00['property_userId'].unique())}
Goal_dict = {v: i for i, v in enumerate(DF00['Goal'].unique())}
Gender_dict = {v: i for i, v in enumerate(DF00['Gender'].unique())}
Habit_dict = {v: i for i, v in enumerate(DF00['Habit'].unique())}
print(Goal_dict)
print(Gender_dict)
print(Habit_dict)
# DF00['UID_int'] = DF00['property_userId'].map(uid_dict)
DF00['Goal_int'] = DF00['Goal'].map(Goal_dict)
DF00['Gender_int'] = DF00['Gender'].map(Gender_dict)
DF00['Habit_int'] = DF00['Habit'].map(Habit_dict)
# This is what the new data frame looks like
DF00.head()
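As an aside, pandas can build the same kind of integer codes in one call. A minimal alternative sketch (not used in the rest of this notebook); pd.factorize returns the codes plus the unique levels:
# equivalent one-liner per column: codes follow order of first appearance
DF00['Goal_int'], goal_levels = pd.factorize(DF00['Goal'])
print(dict(zip(goal_levels, range(len(goal_levels)))))  # the implied mapping, for inspection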
import time
start_time = time.time()
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
X = DF00[['Gender_int', 'Goal_int', 'Habit_int', 'Day_of_Week']]
y = DF00['Action']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2015)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
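Since class_weight is used below, it is worth a quick look at the label balance first; a one-line check:
# class balance of the training labels; imbalance is what motivates class_weight='balanced'
print(y_train.value_counts(normalize=True))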
# The following line defines the grid for estimating "n_estimators" and "max_features";
# the search will take a while, depending on data size.
tuned_parameters = [{'n_estimators': list(range(40, 130, 10)), 'max_features': ['sqrt', None]}]
# If you don't want to estimate "max_features", feel free to use this line instead of the one above.
# tuned_parameters = [{'n_estimators': list(range(40, 130, 10))}]
scores = ['precision', 'recall']
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    classifier = RandomForestClassifier(random_state=10, max_depth=None,
                                        min_samples_split=2, class_weight='balanced')
    # weighted averaging lets the precision/recall scorers handle multi-class labels
    clf = GridSearchCV(classifier, tuned_parameters, cv=5, n_jobs=4,
                       scoring='%s_weighted' % score)
    clf.fit(X_train, y_train)
    print("Best parameters set found on development set:")
    print(clf.best_params_)
    print("Grid scores on development set:")
    results = clf.cv_results_
    for mean, std, params in zip(results['mean_test_score'],
                                 results['std_test_score'],
                                 results['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print("Detailed classification report:")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
print("--- %s seconds ---" % (time.time() - start_time))
### end of parameters estimation ###
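After the search, GridSearchCV has already refit the best model on the full training split. A minimal sketch of reusing it (clf comes from the loop above, so the recall-tuned search is the one kept); the file name here is hypothetical:
best_rf = clf.best_estimator_
print(best_rf.n_estimators, best_rf.max_features)
# persist it for the per-data-set comparison
import joblib
joblib.dump(best_rf, 'rf_03_to_05.pkl')  # hypothetical file name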