import pandas as pd
import numpy as np
# Load the two exported datasets.  Further down, DF00 supplies the training
# rows and DF01 the held-out test rows.
DF00 = pd.read_csv('./FB_data_ML_with_uid_2015_35.csv')
DF01 = pd.read_csv('./FB_data_ML_with_uid_201506.csv')

# Discard the 'Unnamed: 0' column of each frame in place (presumably a row
# index that was written out together with the CSV -- verify against the
# export step), then show the resulting (rows, columns) shape.
DF00.drop(labels='Unnamed: 0', axis=1, inplace=True)
print(DF00.shape)

DF01.drop(labels='Unnamed: 0', axis=1, inplace=True)
print(DF01.shape)
# This is the same categorical -> integer mapping used in Part01 and Part02:
# every distinct value of a DF00 column receives an integer code.  The codes
# follow set iteration order, so they carry no meaning beyond the dictionaries
# printed below.
# (The per-user mapping property_userId -> UID_int is commented out in this
# part and therefore not built.)
Goal_dict = {value: code for code, value in enumerate(set(DF00['Goal']))}
Gender_dict = {value: code for code, value in enumerate(set(DF00['Gender']))}
Habit_dict = {value: code for code, value in enumerate(set(DF00['Habit']))}
print(Goal_dict)
print(Gender_dict)
print(Habit_dict)

# Encode the training frame.
DF00['Goal_int'] = DF00['Goal'].map(Goal_dict)
DF00['Gender_int'] = DF00['Gender'].map(Gender_dict)
DF00['Habit_int'] = DF00['Habit'].map(Habit_dict)

# Encode the test frame from its OWN columns, reusing the dictionaries learned
# on DF00 so both frames share one code table.  Assigning Series derived from
# DF00 here would align on index labels and silently overwrite the test
# features with training data.
# NOTE(review): assumes DF01 has the same Goal/Gender/Habit columns as DF00.
# Values that occur only in DF01 are absent from the dictionaries and map to
# NaN -- confirm that cannot happen, or handle it before fitting.
DF01['Goal_int'] = DF01['Goal'].map(Goal_dict)
DF01['Gender_int'] = DF01['Gender'].map(Gender_dict)
DF01['Habit_int'] = DF01['Habit'].map(Habit_dict)

# Notebook-style previews; as bare expressions they display nothing when the
# file is run as a script.
DF00.head()
DF01.head()
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
# Fit on DF00 and predict on DF01.  Features are the three integer-encoded
# categorical columns plus Day_of_Week; the target is the Action column.
# (UID_int is not used as a feature in this part.)
feature_columns = ['Gender_int', 'Goal_int', 'Habit_int', 'Day_of_Week']
X_train = DF00[feature_columns]
y_train = DF00['Action']
X_test = DF01[feature_columns]
y_test = DF01['Action']

classifier = RandomForestClassifier(
    n_estimators=40,
    max_features='sqrt',
    random_state=10,  # fixed seed so repeated runs build the same forest
    max_depth=None,   # grow each tree until its leaves cannot be split further
    # A node needs at least two samples before it can be split, so 2 grows the
    # same trees as the former value 1, while newer scikit-learn releases
    # reject the integer 1 outright.
    min_samples_split=2,
    # NOTE(review): 'auto' is the legacy spelling; newer scikit-learn releases
    # offer 'balanced' instead, whose weighting heuristic is not identical.
    # Confirm against the installed version before switching.
    class_weight='auto',
)

# fit() returns the estimator itself, so clf and classifier are one object.
clf = classifier.fit(X_train, y_train)
y_pred = clf.predict(X_test)
from sklearn import metrics
from sklearn.metrics import confusion_matrix


def _report_scores(y_true, y_hat):
    """Print summary scores and the confusion matrix for one set of predictions.

    Prints accuracy and micro-averaged precision, recall and F1, one
    "<name>: <value>" line each, followed by the confusion matrix.

    Args:
        y_true: the reference labels.
        y_hat: the predicted labels, in the same order as ``y_true``.

    Returns:
        The confusion matrix computed from ``y_true`` and ``y_hat``.
    """
    # NOTE(review): with a single label per row, micro-averaged precision,
    # recall and F1 should all coincide with accuracy, so these four lines
    # likely repeat one number -- consider 'macro' or 'weighted' averaging if
    # per-class behaviour matters.
    scores = (
        ("accuracy:", metrics.accuracy_score(y_true, y_hat)),
        ("precision:", metrics.precision_score(y_true, y_hat, average='micro')),
        ("recall:", metrics.recall_score(y_true, y_hat, average='micro')),
        ("f1 score:", metrics.f1_score(y_true, y_hat, average='micro')),
    )
    for label, score in scores:
        # Format explicitly: print(a, b) would emit a tuple repr under the
        # Python 2 print statement.
        print("{} {}".format(label, score))

    matrix = confusion_matrix(y_true, y_hat)
    print(matrix)
    return matrix


# Scores on the training data (how well the forest fits what it has seen).
y_pred_train = clf.predict(X_train)
cm = _report_scores(y_train, y_pred_train)

# Scores on the held-out test data; cm ends up holding the test-set matrix.
cm = _report_scores(y_test, y_pred)