In this notebook, I try to predict the future activities of one user from his/her previous records.

In [1]:
import numpy as np
import pandas as pd

All the data (March through August) are loaded first, so that I can identify the top users (those with the most activities).

In [2]:
DF00 = pd.read_csv('./FB_data_ML_with_uid_2015_38.csv')
DF00.drop('Unnamed: 0', axis=1, inplace=True)  # drop the leftover index column
print DF00.shape
DF00.head()
(857662, 6)
Out[2]:
property_userId Goal Gender Habit Day_of_Week Action
0 fabbc998-2a02-46b8-8442-45445096913b Energy male Drink Water 0 0
1 fabbc998-2a02-46b8-8442-45445096913b Energy male Drink Water 0 0
2 fabbc998-2a02-46b8-8442-45445096913b Energy male Meditate 0 0
3 fabbc998-2a02-46b8-8442-45445096913b Energy male Drink Water 0 0
4 fabbc998-2a02-46b8-8442-45445096913b Energy male Clean & Tidy up 0 0
In [3]:
# same mapping as before: assign an (arbitrary) integer code to each category
Goal_dict = {v: i for i, v in enumerate(set(DF00['Goal']))}
Gender_dict = {v: i for i, v in enumerate(set(DF00['Gender']))}
Habit_dict = {v: i for i, v in enumerate(set(DF00['Habit']))}
# print Goal_dict
# print Gender_dict
# print Habit_dict
DF00['Goal_int'] = DF00['Goal'].map(Goal_dict)
DF00['Gender_int'] = DF00['Gender'].map(Gender_dict)
DF00['Habit_int'] = DF00['Habit'].map(Habit_dict)
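
A deterministic alternative (a sketch, not from the original run): pd.factorize assigns integer codes in order of first appearance, so the encoding is reproducible across runs, unlike set iteration order.

# sketch: reproducible integer codes via pd.factorize
for col in ['Goal', 'Gender', 'Habit']:
    DF00[col + '_int'], _ = pd.factorize(DF00[col])  # codes follow order of first appearance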

To count each user's activities and rank the users by their totals:

In [4]:
uid_count = pd.DataFrame(DF00.property_userId.value_counts())
uid_count.reset_index(inplace = True)
uid_count.columns = ['userID','num_appearance']
print uid_count.shape
print 'showing top 20 users here:'
top_users = uid_count.userID.head(20)
top_users
(24640, 2)
showing top 20 users here:
Out[4]:
0     c3b61e02-2f7c-40a8-a899-515edfbdcd10
1     be443e01-3ea4-4583-a592-01ea6129e8a0
2     15fb9a6d-e809-40f5-b406-5f625a2178c7
3     555a9ecf-2cc1-4efd-b13b-7ee3e7624a0b
4     16eeafdf-8518-48de-8edc-21e5b780a1eb
5     2569d7ff-dd25-4da3-9458-9fc5b93d0acb
6     47a480bf-98b1-4dcc-a843-0d377f171d29
7     d4e85466-36bb-4c8c-a981-671e1dbdcbbf
8     78fe4316-05f2-44f1-b472-4e7682c20bd2
9     40282081-1d37-4fd3-93cd-502a5d0a0c19
10    58c7498b-4f59-4730-8d8e-3334bb303733
11    5c52685d-4900-4b76-abf1-e58e531439d9
12    b24a468c-d672-4df0-9f20-348cc387aa17
13    22780ec5-d412-40a9-859b-889ce10ff490
14    ff3c22ed-c818-4d92-9dc8-343760f25119
15    b849f6f6-61e7-42eb-9d15-91d151cb88a9
16    1df1bdbc-fe9d-46fe-b0e0-fd6c8653aa79
17    0ce742d1-7b2b-428e-9546-264324f4db1e
18    6f152ed6-a514-4d21-b8c5-48970105683c
19    94356055-848f-4546-854f-8bdc839967f6
Name: userID, dtype: object

Before selecting one user, I check the gender distribution of the top users:

In [5]:
# gender information for the top 20 users
from collections import Counter

print 'For top 20 users:'
for n in top_users:
    DF_tmp = DF00[DF00['property_userId'] == n]
    print list(set(DF_tmp.Gender))

# gender information for the top 100 users
print 'For top 100 users:'
top_user_gender = []
for n in uid_count.userID.head(100):
    DF_tmp = DF00[DF00['property_userId'] == n]
    top_user_gender += list(set(DF_tmp.Gender))

print Counter(top_user_gender)
For top 20 users:
['InputNAN']
['female']
['female']
['female']
['female']
['InputNAN']
['female']
['female']
['female']
['InputNAN']
['female']
['female']
['InputNAN']
['female']
['InputNAN']
['male']
['InputNAN']
['male']
['InputNAN']
['InputNAN']
For top 100 users:
Counter({'female': 53, 'InputNAN': 27, 'male': 19, 'other': 4})
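
The loop above filters DF00 once per user; the same tally can be done in one pass with groupby (a sketch, assuming each user carries a single Gender value, as the per-user sets above indicate):

# sketch: one-pass gender tally for the 100 most active users
gender_per_user = DF00.groupby('property_userId')['Gender'].first()
print gender_per_user.loc[uid_count.userID.head(100)].value_counts()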

Now, to pick one user, I use the rank number in front of the user ID (0 = the first).

The rank #6 user (index 5) is selected here:

In [6]:
uid = top_users[5]
DF_user = DF00[DF00['property_userId'] == uid]
print 'current userID: ' + uid
f = open('result_' + uid + '.txt', 'w')  # closed in the next cell
# DF_user.head()
current userID: 2569d7ff-dd25-4da3-9458-9fc5b93d0acb
In [7]:
print DF_user.shape
print set(DF_user.Action)
print set(DF_user.Gender)
print set(DF_user.Goal)
if len(set(DF_user.Action)) == 1:
    # degenerate case: only one Action value, so there is nothing to classify
    msg = 'this ' + list(DF_user['Gender'].head(1))[0] + ' user completed all habits.'
    print msg
    f.write(msg + '\n')
    print 'no need to build the model this time.'
else:
    msg = 'this is a ' + list(DF_user['Gender'].head(1))[0] + '(gender) user.'
    print msg
    f.write(msg + '\n')

f.close()
(1661, 9)
set([0, 1])
set(['InputNAN'])
set(['Energy'])
this is a InputNAN(gender) user.

The data are divided into March-to-May, June, July, and August subsets, using row-index boundaries that mark each month's end in the chronologically ordered file:

In [8]:
print len(set(DF_user.Habit))
# the row-index boundaries below correspond to month ends in the full dataset
DF_user_May = DF_user[DF_user.index <= 229378]                               # Mar-May
print DF_user_May.shape
print len(set(DF_user_May.Habit))
DF_user_Jun = DF_user[(DF_user.index > 229378) & (DF_user.index <= 397200)]  # Jun
print DF_user_Jun.shape
print len(set(DF_user_Jun.Habit))
DF_user_Jul = DF_user[(DF_user.index > 397200) & (DF_user.index <= 585463)]  # Jul
print DF_user_Jul.shape
print len(set(DF_user_Jul.Habit))
DF_user_Aug = DF_user[(DF_user.index > 585463) & (DF_user.index <= 857662)]  # Aug
print DF_user_Aug.shape
print len(set(DF_user_Aug.Habit))
13
(705, 9)
12
(265, 9)
12
(367, 9)
12
(324, 9)
12
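
The four cut points above are repeated inline; a small helper could keep them in one place (a sketch under the same assumption that row order is chronological; the names are mine):

# sketch: centralize the month boundaries used above
MONTH_BOUNDS = [('May', -1, 229378),        # Mar-May
                ('Jun', 229378, 397200),
                ('Jul', 397200, 585463),
                ('Aug', 585463, 857662)]

def split_by_month(df):
    # returns {label: sub-frame} using the global row-index cut points
    return dict((label, df[(df.index > lo) & (df.index <= hi)])
                for label, lo, hi in MONTH_BOUNDS)

parts = split_by_month(DF_user)   # parts['Jun'] equals DF_user_Jun above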

First, using the March-to-May data to predict June:

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
In [10]:
X_train = DF_user_May[['Goal_int', 'Habit_int', 'Day_of_Week']]  # features: goal, habit, day of week
y_train = DF_user_May['Action']                                  # target: the recorded action (0/1)

X_test = DF_user_Jun[['Goal_int', 'Habit_int', 'Day_of_Week']]
y_test = DF_user_Jun['Action']

# min_samples_split=1 and class_weight='auto' are sklearn-0.17-era settings
classifier = RandomForestClassifier(n_estimators=80, max_features='sqrt', random_state=10,
                                    max_depth=None, min_samples_split=1, class_weight='auto', n_jobs=4)

clf = classifier.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_train = clf.predict(X_train)

# average='binary' reports positive-class precision/recall/F1
print("results summary for training dataset (Mar-May data)")
print("accuracy:", metrics.accuracy_score(y_train, y_pred_train))
print("precision:", metrics.precision_score(y_train, y_pred_train, average='binary'))
print("recall:", metrics.recall_score(y_train, y_pred_train, average='binary'))
print("f1 score:", metrics.f1_score(y_train, y_pred_train, average='binary'))
cm = confusion_matrix(y_train, y_pred_train)
print(cm)

print("results summary for testing dataset (Jun data)")
print("accuracy:", metrics.accuracy_score(y_test, y_pred))
print("precision:", metrics.precision_score(y_test, y_pred, average='binary'))
print("recall:", metrics.recall_score(y_test, y_pred, average='binary'))
print("f1 score:", metrics.f1_score(y_test, y_pred, average='binary'))
cm = confusion_matrix(y_test, y_pred)
print(cm)
results summary for training dataset (Mar-May data)
('accuracy:', 0.76170212765957446)
('precision:', 0.64210526315789473)
('recall:', 0.73493975903614461)
('f1 score:', 0.6853932584269663)
[[354 102]
 [ 66 183]]
results summary for testing dataset (Jun data)
('accuracy:', 0.7245283018867924)
('precision:', 0.64655172413793105)
('recall:', 0.7009345794392523)
('f1 score:', 0.67264573991031384)
[[117  41]
 [ 32  75]]
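
The fit-and-report steps repeat verbatim for each horizon below; a helper like the following could factor them out (report is my name, not from the original):

def report(title, y_true, y_hat):
    # print accuracy, positive-class precision/recall/F1, and the confusion matrix
    print(title)
    print("accuracy:", metrics.accuracy_score(y_true, y_hat))
    print("precision:", metrics.precision_score(y_true, y_hat, average='binary'))
    print("recall:", metrics.recall_score(y_true, y_hat, average='binary'))
    print("f1 score:", metrics.f1_score(y_true, y_hat, average='binary'))
    print(confusion_matrix(y_true, y_hat))

# usage: report("results summary for training dataset (Mar-May data)", y_train, y_pred_train)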

Next, using the March-to-June data to predict July:

In [11]:
DF_user_May_Jun = pd.concat([DF_user_May, DF_user_Jun])  # Mar-Jun training window

X_train = DF_user_May_Jun[['Goal_int', 'Habit_int', 'Day_of_Week']]
y_train = DF_user_May_Jun['Action']

X_test = DF_user_Jul[['Goal_int', 'Habit_int', 'Day_of_Week']]
y_test = DF_user_Jul['Action']

classifier = RandomForestClassifier(n_estimators=80, max_features='sqrt', random_state=10,
                                    max_depth=None, min_samples_split=1, class_weight='auto', n_jobs=4)

clf = classifier.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_train = clf.predict(X_train)

print("results summary for training dataset (Mar-Jun data)")
print("accuracy:", metrics.accuracy_score(y_train, y_pred_train))
print("precision:", metrics.precision_score(y_train, y_pred_train, average='binary'))
print("recall:", metrics.recall_score(y_train, y_pred_train, average='binary'))
print("f1 score:", metrics.f1_score(y_train, y_pred_train, average='binary'))
cm = confusion_matrix(y_train, y_pred_train)
print(cm)

print("results summary for testing dataset (Jul data)")
print("accuracy:", metrics.accuracy_score(y_test, y_pred))
print("precision:", metrics.precision_score(y_test, y_pred, average='binary'))
print("recall:", metrics.recall_score(y_test, y_pred, average='binary'))
print("f1 score:", metrics.f1_score(y_test, y_pred, average='binary'))
cm = confusion_matrix(y_test, y_pred)
print(cm)
results summary for training dataset (Mar-Jun data)
('accuracy:', 0.77525773195876291)
('precision:', 0.69491525423728817)
('recall:', 0.6910112359550562)
('f1 score:', 0.6929577464788732)
[[506 108]
 [110 246]]
results summary for testing dataset (Jul data)
('accuracy:', 0.70844686648501365)
('precision:', 0.44881889763779526)
('recall:', 0.6063829787234043)
('f1 score:', 0.51583710407239824)
[[203  70]
 [ 37  57]]

Finally, using the March-to-July data to predict August:

In [12]:
DF_user_May_Jul = pd.concat([DF_user_May_Jun, DF_user_Jul])  # Mar-Jul training window

X_train = DF_user_May_Jul[['Goal_int', 'Habit_int', 'Day_of_Week']]
y_train = DF_user_May_Jul['Action']

X_test = DF_user_Aug[['Goal_int', 'Habit_int', 'Day_of_Week']]
y_test = DF_user_Aug['Action']

classifier = RandomForestClassifier(n_estimators=80, max_features='sqrt', random_state=10,
                                    max_depth=None, min_samples_split=1, class_weight='auto', n_jobs=4)

clf = classifier.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred_train = clf.predict(X_train)

print("results summary for training dataset (Mar-Jul data)")
print("accuracy:", metrics.accuracy_score(y_train, y_pred_train))
print("precision:", metrics.precision_score(y_train, y_pred_train, average='binary'))
print("recall:", metrics.recall_score(y_train, y_pred_train, average='binary'))
print("f1 score:", metrics.f1_score(y_train, y_pred_train, average='binary'))
cm = confusion_matrix(y_train, y_pred_train)
print(cm)

print("results summary for testing dataset (Aug data)")
print("accuracy:", metrics.accuracy_score(y_test, y_pred))
print("precision:", metrics.precision_score(y_test, y_pred, average='binary'))
print("recall:", metrics.recall_score(y_test, y_pred, average='binary'))
print("f1 score:", metrics.f1_score(y_test, y_pred, average='binary'))
cm = confusion_matrix(y_test, y_pred)
print(cm)
results summary for training dataset (Mar-Jul data)
('accuracy:', 0.75243081525804034)
('precision:', 0.60957642725598526)
('recall:', 0.73555555555555552)
('f1 score:', 0.66666666666666663)
[[675 212]
 [119 331]]
results summary for testing dataset (Aug data)
('accuracy:', 0.66975308641975306)
('precision:', 0.35483870967741937)
('recall:', 0.61971830985915488)
('f1 score:', 0.45128205128205129)
[[173  80]
 [ 27  44]]
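
Across the three horizons, test accuracy falls from 0.72 (June) to 0.71 (July) to 0.67 (August), and the positive-class F1 from 0.67 to 0.52 to 0.45, even though the training window keeps growing. This user's behavior appears to drift over time, so simply accumulating older records does not improve next-month prediction.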