実践データ分析100本ノック(第3章-顧客の全体像を把握する-27~30)

2020年12月21日

実践データ分析100本ノック(第3章-顧客の全体像を把握する-27~30)

参考:生年月日から年齢を計算する


#--ノック27------------------------------
# %%
# 結合
customer_join = pd.merge(customer_join, uselog_customer, on="customer_id", how="left")
customer_join = pd.merge(customer_join, uselog_weekday[["customer_id", "routine_flg"]], on="customer_id", how="left")
print(customer_join.head())
#   customer_id   name class gender start_date end_date campaign_id  is_deleted  \
# 0    OA832399   XXXX   C01      F 2015-05-01      NaT         CA1           0   
# 1    PL270116  XXXXX   C01      M 2015-05-01      NaT         CA1           0   
# 2    OA974876  XXXXX   C01      M 2015-05-01      NaT         CA1           0   
# 3    HD024127  XXXXX   C01      F 2015-05-01      NaT         CA1           0   
# 4    HD661448  XXXXX   C03      F 2015-05-01      NaT         CA1           0   

#   class_name  price campaign_name    mean_x  median_x  max_x  min_x    mean_y  \
# 0     オールタイム  10500            通常  4.833333       5.0      8      2  4.833333   
# 1     オールタイム  10500            通常  5.083333       5.0      7      3  5.083333   
# 2     オールタイム  10500            通常  4.583333       5.0      6      3  4.583333   
# 3     オールタイム  10500            通常  4.833333       4.5      7      2  4.833333   
# 4        ナイト   6000            通常  3.916667       4.0      6      1  3.916667   

#    median_y  max_y  min_y  routine_flg  
# 0       5.0      8      2            1  
# 1       5.0      7      3            1  
# 2       5.0      6      3            1  
# 3       4.5      7      2            1  
# 4       4.0      6      1            1 
# %%

# 欠損値の確認
customer_join.isnull().sum()
# customer_id         0
# name                0
# class               0
# gender              0
# start_date          0
# end_date         2842
# campaign_id         0
# is_deleted          0
# class_name          0
# price               0
# campaign_name       0
# mean_x              0
# median_x            0
# max_x               0
# min_x               0
# mean_y              0
# median_y            0
# max_y               0
# min_y               0
# routine_flg         0
# dtype: int64
# %%

#--ノック28------------------------------
# %%
# 日付の比較を可能にするライブラリをインポート
from dateutil.relativedelta import relativedelta
# 別カラムを用意
customer_join["calc_date"] = customer_join["end_date"]
# カラム「calc_date」の欠損値があるデータを「20190430」で埋める
customer_join["calc_date"] = customer_join["calc_date"].fillna(pd.to_datetime("20190430"))
# 初期化
customer_join["membership_period"] = 0
for i in range(len(customer_join)):
    # 退会年月日(第一引数) - 入会年月日(第二引数) = 会員期間
    delta = relativedelta(customer_join["calc_date"].iloc[i], customer_join["start_date"].iloc[i])
    # 年を月に変換
    customer_join["membership_period"].iloc[i] = delta.years*12 + delta.months
print(customer_join.head())
#   customer_id   name class gender start_date end_date campaign_id  is_deleted  \
# 0    OA832399   XXXX   C01      F 2015-05-01      NaT         CA1           0   
# 1    PL270116  XXXXX   C01      M 2015-05-01      NaT         CA1           0   
# 2    OA974876  XXXXX   C01      M 2015-05-01      NaT         CA1           0   
# 3    HD024127  XXXXX   C01      F 2015-05-01      NaT         CA1           0   
# 4    HD661448  XXXXX   C03      F 2015-05-01      NaT         CA1           0   

#   class_name  price  ... median_x  max_x  min_x    mean_y  median_y  max_y  \
# 0     オールタイム  10500  ...      5.0      8      2  4.833333       5.0      8   
# 1     オールタイム  10500  ...      5.0      7      3  5.083333       5.0      7   
# 2     オールタイム  10500  ...      5.0      6      3  4.583333       5.0      6   
# 3     オールタイム  10500  ...      4.5      7      2  4.833333       4.5      7   
# 4        ナイト   6000  ...      4.0      6      1  3.916667       4.0      6   

#    min_y  routine_flg  calc_date  membership_period  
# 0      2            1 2019-04-30                 47  
# 1      3            1 2019-04-30                 47  
# 2      3            1 2019-04-30                 47  
# 3      2            1 2019-04-30                 47  
# 4      1            1 2019-04-30                 47  

#--ノック29------------------------------
# %%
des = customer_join[["mean", "median", "max", "min"]].describe()
print(des)
#               mean       median          max          min
# count  4192.000000  4192.000000  4192.000000  4192.000000
# mean      5.333127     5.250596     7.823950     3.041269
# std       1.777533     1.874874     2.168959     1.951565
# min       1.000000     1.000000     1.000000     1.000000
# 25%       4.250000     4.000000     7.000000     2.000000
# 50%       5.000000     5.000000     8.000000     3.000000
# 75%       6.416667     6.500000     9.000000     4.000000
# max      12.000000    12.000000    14.000000    12.000000

count = customer_join.groupby("routine_flg").count()["customer_id"]
print(count)
# routine_flg
# 0     779
# 1    3413
# Name: customer_id, dtype: int64

import matplotlib.pyplot as plt
%matplotlib inline
plt.hist(customer_join["membership_period"])


#--ノック30------------------------------
# %%

# 退会ユーザーを確認
customer_end = customer_join.loc[customer_join["is_deleted"]==1]
print(customer_end.describe())
#        is_deleted         price         mean       median          max  \
# count      1350.0   1350.000000  1350.000000  1350.000000  1350.000000   
# mean          1.0   8595.555556     3.865474     3.621852     6.461481   
# std           0.0   1949.163652     1.246385     1.270847     2.584021   
# min           1.0   6000.000000     1.000000     1.000000     1.000000   
# 25%           1.0   6000.000000     3.000000     3.000000     4.000000   
# 50%           1.0   7500.000000     4.000000     4.000000     7.000000   
# 75%           1.0  10500.000000     4.666667     4.500000     8.000000   
# max           1.0  10500.000000     9.000000     9.000000    13.000000   

#                min  routine_flg  membership_period  
# count  1350.000000  1350.000000        1350.000000  
# mean      1.821481     0.456296           8.026667  
# std       0.976361     0.498271           5.033692  
# min       1.000000     0.000000           1.000000  
# 25%       1.000000     0.000000           4.000000  
# 50%       2.000000     0.000000           7.000000  
# 75%       2.000000     1.000000          11.000000  
# max       8.000000     1.000000          23.000000

# 継続ユーザーを確認
customer_stay = customer_join.loc[customer_join["is_deleted"]==0]
print(customer_stay.describe())
#        is_deleted         price         mean       median          max  \
# count      2842.0   2842.000000  2842.000000  2842.000000  2842.000000   
# mean          0.0   8542.927516     6.030288     6.024279     8.471147   
# std           0.0   1977.189779     1.553587     1.599765     1.571048   
# min           0.0   6000.000000     3.166667     3.000000     5.000000   
# 25%           0.0   6000.000000     4.833333     5.000000     7.000000   
# 50%           0.0   7500.000000     5.583333     5.500000     8.000000   
# 75%           0.0  10500.000000     7.178030     7.000000    10.000000   
# max           0.0  10500.000000    12.000000    12.000000    14.000000   

#                min  routine_flg  membership_period  
# count  2842.000000  2842.000000        2842.000000  
# mean      3.620690     0.984166          23.970443  
# std       2.030488     0.124855          13.746761  
# min       1.000000     0.000000           1.000000  
# 25%       2.000000     1.000000          12.000000  
# 50%       3.000000     1.000000          24.000000  
# 75%       5.000000     1.000000          35.000000  
# max      12.000000     1.000000          47.000000  

# CSV出力
customer_join.to_csv("customer_join.csv", index=False)

2020年12月21日