実践データ分析100本ノック(第3章-顧客の全体像を把握する-27~30)
2020年12月21日
実践データ分析100本ノック(第3章-顧客の全体像を把握する-27~30)
参考:生年月日から年齢を計算する
#--ノック27------------------------------
# %%
# 結合
customer_join = pd.merge(customer_join, uselog_customer, on="customer_id", how="left")
customer_join = pd.merge(customer_join, uselog_weekday[["customer_id", "routine_flg"]], on="customer_id", how="left")
print(customer_join.head())
# customer_id name class gender start_date end_date campaign_id is_deleted \
# 0 OA832399 XXXX C01 F 2015-05-01 NaT CA1 0
# 1 PL270116 XXXXX C01 M 2015-05-01 NaT CA1 0
# 2 OA974876 XXXXX C01 M 2015-05-01 NaT CA1 0
# 3 HD024127 XXXXX C01 F 2015-05-01 NaT CA1 0
# 4 HD661448 XXXXX C03 F 2015-05-01 NaT CA1 0
# class_name price campaign_name mean_x median_x max_x min_x mean_y \
# 0 オールタイム 10500 通常 4.833333 5.0 8 2 4.833333
# 1 オールタイム 10500 通常 5.083333 5.0 7 3 5.083333
# 2 オールタイム 10500 通常 4.583333 5.0 6 3 4.583333
# 3 オールタイム 10500 通常 4.833333 4.5 7 2 4.833333
# 4 ナイト 6000 通常 3.916667 4.0 6 1 3.916667
# median_y max_y min_y routine_flg
# 0 5.0 8 2 1
# 1 5.0 7 3 1
# 2 5.0 6 3 1
# 3 4.5 7 2 1
# 4 4.0 6 1 1
# %%
# 欠損値の確認
customer_join.isnull().sum()
# customer_id 0
# name 0
# class 0
# gender 0
# start_date 0
# end_date 2842
# campaign_id 0
# is_deleted 0
# class_name 0
# price 0
# campaign_name 0
# mean_x 0
# median_x 0
# max_x 0
# min_x 0
# mean_y 0
# median_y 0
# max_y 0
# min_y 0
# routine_flg 0
# dtype: int64
# %%
#--ノック28------------------------------
# %%
# 日付の比較を可能にするライブラリをインポート
from dateutil.relativedelta import relativedelta
# 別カラムを用意
customer_join["calc_date"] = customer_join["end_date"]
# カラム「calc_date」の欠損値があるデータを「20190430」で埋める
customer_join["calc_date"] = customer_join["calc_date"].fillna(pd.to_datetime("20190430"))
# 初期化
customer_join["membership_period"] = 0
for i in range(len(customer_join)):
# 退会年月日(第一引数) - 入会年月日(第二引数) = 会員期間
delta = relativedelta(customer_join["calc_date"].iloc[i], customer_join["start_date"].iloc[i])
# 年を月に変換
customer_join["membership_period"].iloc[i] = delta.years*12 + delta.months
print(customer_join.head())
# customer_id name class gender start_date end_date campaign_id is_deleted \
# 0 OA832399 XXXX C01 F 2015-05-01 NaT CA1 0
# 1 PL270116 XXXXX C01 M 2015-05-01 NaT CA1 0
# 2 OA974876 XXXXX C01 M 2015-05-01 NaT CA1 0
# 3 HD024127 XXXXX C01 F 2015-05-01 NaT CA1 0
# 4 HD661448 XXXXX C03 F 2015-05-01 NaT CA1 0
# class_name price ... median_x max_x min_x mean_y median_y max_y \
# 0 オールタイム 10500 ... 5.0 8 2 4.833333 5.0 8
# 1 オールタイム 10500 ... 5.0 7 3 5.083333 5.0 7
# 2 オールタイム 10500 ... 5.0 6 3 4.583333 5.0 6
# 3 オールタイム 10500 ... 4.5 7 2 4.833333 4.5 7
# 4 ナイト 6000 ... 4.0 6 1 3.916667 4.0 6
# min_y routine_flg calc_date membership_period
# 0 2 1 2019-04-30 47
# 1 3 1 2019-04-30 47
# 2 3 1 2019-04-30 47
# 3 2 1 2019-04-30 47
# 4 1 1 2019-04-30 47
#--ノック29------------------------------
# %%
des = customer_join[["mean", "median", "max", "min"]].describe()
print(des)
# mean median max min
# count 4192.000000 4192.000000 4192.000000 4192.000000
# mean 5.333127 5.250596 7.823950 3.041269
# std 1.777533 1.874874 2.168959 1.951565
# min 1.000000 1.000000 1.000000 1.000000
# 25% 4.250000 4.000000 7.000000 2.000000
# 50% 5.000000 5.000000 8.000000 3.000000
# 75% 6.416667 6.500000 9.000000 4.000000
# max 12.000000 12.000000 14.000000 12.000000
count = customer_join.groupby("routine_flg").count()["customer_id"]
print(count)
# routine_flg
# 0 779
# 1 3413
# Name: customer_id, dtype: int64
import matplotlib.pyplot as plt
%matplotlib inline
plt.hist(customer_join["membership_period"])
#--ノック30------------------------------
# %%
# 退会ユーザーを確認
customer_end = customer_join.loc[customer_join["is_deleted"]==1]
print(customer_end.describe())
# is_deleted price mean median max \
# count 1350.0 1350.000000 1350.000000 1350.000000 1350.000000
# mean 1.0 8595.555556 3.865474 3.621852 6.461481
# std 0.0 1949.163652 1.246385 1.270847 2.584021
# min 1.0 6000.000000 1.000000 1.000000 1.000000
# 25% 1.0 6000.000000 3.000000 3.000000 4.000000
# 50% 1.0 7500.000000 4.000000 4.000000 7.000000
# 75% 1.0 10500.000000 4.666667 4.500000 8.000000
# max 1.0 10500.000000 9.000000 9.000000 13.000000
# min routine_flg membership_period
# count 1350.000000 1350.000000 1350.000000
# mean 1.821481 0.456296 8.026667
# std 0.976361 0.498271 5.033692
# min 1.000000 0.000000 1.000000
# 25% 1.000000 0.000000 4.000000
# 50% 2.000000 0.000000 7.000000
# 75% 2.000000 1.000000 11.000000
# max 8.000000 1.000000 23.000000
# 継続ユーザーを確認
customer_stay = customer_join.loc[customer_join["is_deleted"]==0]
print(customer_stay.describe())
# is_deleted price mean median max \
# count 2842.0 2842.000000 2842.000000 2842.000000 2842.000000
# mean 0.0 8542.927516 6.030288 6.024279 8.471147
# std 0.0 1977.189779 1.553587 1.599765 1.571048
# min 0.0 6000.000000 3.166667 3.000000 5.000000
# 25% 0.0 6000.000000 4.833333 5.000000 7.000000
# 50% 0.0 7500.000000 5.583333 5.500000 8.000000
# 75% 0.0 10500.000000 7.178030 7.000000 10.000000
# max 0.0 10500.000000 12.000000 12.000000 14.000000
# min routine_flg membership_period
# count 2842.000000 2842.000000 2842.000000
# mean 3.620690 0.984166 23.970443
# std 2.030488 0.124855 13.746761
# min 1.000000 0.000000 1.000000
# 25% 2.000000 1.000000 12.000000
# 50% 3.000000 1.000000 24.000000
# 75% 5.000000 1.000000 35.000000
# max 12.000000 1.000000 47.000000
# CSV出力
customer_join.to_csv("customer_join.csv", index=False)
ディスカッション
コメント一覧
まだ、コメントがありません