import numpy as np
import pandas as pd
DF00 = pd.read_csv('./FB_data_ML_with_uid_2015_38.csv')
DF00.drop('Unnamed: 0', axis=1, inplace=True)  # drop the stray index column written by to_csv
print(DF00.shape)
DF00.head()
# same label-to-integer mapping as before; sorted() keeps the mapping
# reproducible, since set iteration order is not stable across Python runs
Goal_dict = {v: i for i, v in enumerate(sorted(set(DF00['Goal'])))}
Gender_dict = {v: i for i, v in enumerate(sorted(set(DF00['Gender'])))}
Habit_dict = {v: i for i, v in enumerate(sorted(set(DF00['Habit'])))}
# print(Goal_dict)
# print(Gender_dict)
# print(Habit_dict)
DF00['Goal_int'] = DF00['Goal'].map(Goal_dict)
DF00['Gender_int'] = DF00['Gender'].map(Gender_dict)
DF00['Habit_int'] = DF00['Habit'].map(Habit_dict)
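# Aside (a minimal sketch, not part of the original flow): pd.factorize would
# give a deterministic encoding in order of first appearance, e.g.:
# goal_codes, goal_labels = pd.factorize(DF00['Goal'])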
uid_count = pd.DataFrame(DF00.property_userId.value_counts())
uid_count.reset_index(inplace = True)
uid_count.columns = ['userID','num_appearance']
print(uid_count.shape)
print('showing top 20 users here:')
top_users = uid_count.userID.head(20)
top_users
# to list the gender information for the top 20 users
print('For top 20 users:')
for n in top_users:
    DF_tmp = DF00[DF00['property_userId'] == n]
    print(list(set(DF_tmp.Gender)))
# to list the gender information for the top 100 users
print('For top 100 users:')
top_user_gender = []
for n in uid_count.userID.head(100):
    DF_tmp = DF00[DF00['property_userId'] == n]
    top_user_gender += list(set(DF_tmp.Gender))
from collections import Counter
print(Counter(top_user_gender))
# the rank #6 user (index 5 in top_users) is selected here:
uid = top_users.iloc[5]
DF_user = DF00[DF00['property_userId'] == uid]
print('current userID: ' + str(uid))
f = open('result_' + str(uid) + '.txt', 'w')
# DF_user.head()
print(DF_user.shape)
print(set(DF_user.Action))
print(set(DF_user.Gender))
print(set(DF_user.Goal))
if len(set(DF_user.Action)) == 1:
    msg = 'this ' + DF_user['Gender'].iloc[0] + ' user completed all habits.'
    print(msg)
    f.write(msg + '\n')
    print('no need to build a model this time.')
else:
    msg = 'this is a ' + DF_user['Gender'].iloc[0] + ' (gender) user.'
    print(msg)
    f.write(msg + '\n')
f.close()
print(len(set(DF_user.Habit)))
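# The index cutoffs below are assumed to mark month boundaries (May-Aug 2015)
# in the chronologically ordered event log.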
DF_user_May = DF_user[DF_user.index <= 229378]
print(DF_user_May.shape)
print(len(set(DF_user_May.Habit)))
DF_user_Jun = DF_user[(DF_user.index > 229378) & (DF_user.index <= 397200)]
print(DF_user_Jun.shape)
print(len(set(DF_user_Jun.Habit)))
DF_user_Jul = DF_user[(DF_user.index > 397200) & (DF_user.index <= 585463)]
print(DF_user_Jul.shape)
print(len(set(DF_user_Jul.Habit)))
DF_user_Aug = DF_user[(DF_user.index > 585463) & (DF_user.index <= 857662)]
print(DF_user_Aug.shape)
print(len(set(DF_user_Aug.Habit)))
# from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn import metrics
from sklearn.metrics import confusion_matrix
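# A minimal helper sketch (optional, not in the original code): the three
# train/evaluate rounds below repeat the same reporting steps, which a small
# function like this could factor out. The name `report` is illustrative.
def report(label, y_true, y_pred):
    print("results summary for " + label)
    print("accuracy:", metrics.accuracy_score(y_true, y_pred))
    print("precision:", metrics.precision_score(y_true, y_pred, average='micro'))
    print("recall:", metrics.recall_score(y_true, y_pred, average='micro'))
    print("f1 score:", metrics.f1_score(y_true, y_pred, average='micro'))
    print(confusion_matrix(y_true, y_pred))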
X_train = DF_user_May[['Goal_int','Habit_int', 'Day_of_Week']]
y_train = DF_user_May['Action']
X_test = DF_user_Jun[['Goal_int','Habit_int', 'Day_of_Week']]
y_test = DF_user_Jun['Action']
# class_weight='balanced' replaces the removed 'auto'; min_samples_split must be >= 2
classifier = RandomForestClassifier(n_estimators=80, max_features='sqrt', random_state=10,
                                    max_depth=None, min_samples_split=2, class_weight='balanced', n_jobs=4)
clf = classifier.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_train = clf.predict(X_train)
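# Optional aside (not in the original output): the fitted forest also exposes
# per-feature importances for the three inputs, e.g.:
# print(dict(zip(X_train.columns, clf.feature_importances_)))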
print("print results summary for training dataset (May data)")
print("accuracy:", metrics.accuracy_score(y_train, y_pred_train))
print("precision:", metrics.precision_score(y_train, y_pred_train, average='micro'))
print("recall:", metrics.recall_score(y_train, y_pred_train, average='micro'))
print("f1 score:", metrics.f1_score(y_train, y_pred_train, average='micro'))
cm = confusion_matrix(y_train, y_pred_train)
print(cm)
print("print results summary for testing dataset (Jun data)")
print("accuracy:", metrics.accuracy_score(y_test, y_pred))
print("precision:", metrics.precision_score(y_test, y_pred, average='micro'))
print("recall:", metrics.recall_score(y_test, y_pred, average='micro'))
print("f1 score:", metrics.f1_score(y_test, y_pred, average='micro'))
cm = confusion_matrix(y_test, y_pred)
print(cm)
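# Note: with average='micro' on single-label multiclass data, precision, recall,
# and F1 all equal accuracy, so macro or per-class scores would be more
# informative. The next two rounds use an expanding window: retrain on all
# months seen so far, then test on the following month.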
DF_user_May_Jun = pd.concat([DF_user_May, DF_user_Jun])
X_train = DF_user_May_Jun[['Goal_int','Habit_int', 'Day_of_Week']]
y_train = DF_user_May_Jun['Action']
X_test = DF_user_Jul[['Goal_int','Habit_int', 'Day_of_Week']]
y_test = DF_user_Jul['Action']
classifier = RandomForestClassifier(n_estimators=80, max_features='sqrt', random_state=10,
                                    max_depth=None, min_samples_split=2, class_weight='balanced', n_jobs=4)
clf = classifier.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_train = clf.predict(X_train)
print("print results summary for training dataset (May to Jun data)")
print("accuracy:", metrics.accuracy_score(y_train, y_pred_train))
print("precision:", metrics.precision_score(y_train, y_pred_train, average='micro'))
print("recall:", metrics.recall_score(y_train, y_pred_train, average='micro'))
print("f1 score:", metrics.f1_score(y_train, y_pred_train, average='micro'))
cm = confusion_matrix(y_train, y_pred_train)
print(cm)
print("print results summary for testing dataset (Jul data)")
print("accuracy:", metrics.accuracy_score(y_test, y_pred))
print("precision:", metrics.precision_score(y_test, y_pred, average='micro'))
print("recall:", metrics.recall_score(y_test, y_pred, average='micro'))
print("f1 score:", metrics.f1_score(y_test, y_pred, average='micro'))
cm = confusion_matrix(y_test, y_pred)
print(cm)
DF_user_May_Jul = pd.concat([DF_user_May_Jun, DF_user_Jul])
X_train = DF_user_May_Jul[['Goal_int','Habit_int', 'Day_of_Week']]
y_train = DF_user_May_Jul['Action']
X_test = DF_user_Aug[['Goal_int','Habit_int', 'Day_of_Week']]
y_test = DF_user_Aug['Action']
classifier = RandomForestClassifier(n_estimators=80, max_features='sqrt', random_state=10,
                                    max_depth=None, min_samples_split=2, class_weight='balanced', n_jobs=4)
clf = classifier.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_train = clf.predict(X_train)
print("print results summary for training dataset (May to Jul data)")
print("accuracy:", metrics.accuracy_score(y_train, y_pred_train))
print("precision:", metrics.precision_score(y_train, y_pred_train, average='micro'))
print("recall:", metrics.recall_score(y_train, y_pred_train, average='micro'))
print("f1 score:", metrics.f1_score(y_train, y_pred_train, average='micro'))
cm = confusion_matrix(y_train, y_pred_train)
print(cm)
print("print results summary for testing dataset (Aug data)")
print("accuracy:", metrics.accuracy_score(y_test, y_pred))
print("precision:", metrics.precision_score(y_test, y_pred, average='micro'))
print("recall:", metrics.recall_score(y_test, y_pred, average='micro'))
print("f1 score:", metrics.f1_score(y_test, y_pred, average='micro'))
cm = confusion_matrix(y_test, y_pred)
print(cm)