実践データ分析100本ノック(第4章-顧客の行動を予測する-35～36)

2020年10月3日 / 2020年12月21日
実践データ分析100本ノック(第4章-顧客の行動を予測する-35～36)

コード


#--Knock 35------------------------------
# %%
# Preview the two frames that are joined below.
print(customer_clustering.head())
#    月内平均値  月内中央値  月内最大値  月内最小値  会員期間  cluster
# 1  5.083333     5.0         7           3           47        3
# 0  4.833333     5.0         8           2           47        3
# 2  4.583333     5.0         6           3           47        3
# 3  4.833333     4.5         7           2           47        3
# 4  3.916667     4.0         6           1           47        3

print(customer.head())
#   customer_id   name class gender  start_date end_date campaign_id  \
# 0    OA832399   XXXX   C01      F  2015-05-01      NaN         CA1   
# 1    PL270116  XXXXX   C01      M  2015-05-01      NaN         CA1   
# 2    OA974876  XXXXX   C01      M  2015-05-01      NaN         CA1   
# 3    HD024127  XXXXX   C01      F  2015-05-01      NaN         CA1   
# 4    HD661448  XXXXX   C03      F  2015-05-01      NaN         CA1   

#    is_deleted class_name  price campaign_name      mean  median  max  min  \
# 0           0     オールタイム  10500            通常  4.833333     5.0    8    2   
# 1           0     オールタイム  10500            通常  5.083333     5.0    7    3   
# 2           0     オールタイム  10500            通常  4.583333     5.0    6    3   
# 3           0     オールタイム  10500            通常  4.833333     4.5    7    2   
# 4           0        ナイト   6000            通常  3.916667     4.0    6    1   

#    routine_flg   calc_date  membership_period  
# 0            1  2019-04-30                 47  
# 1            1  2019-04-30                 47  
# 2            1  2019-04-30                 47  
# 3            1  2019-04-30                 47  
# 4            1  2019-04-30                 47

# Join the two frames side by side (axis=1). pd.concat matches rows by index
# label, not by position, so the differing row order shown above is harmless
# as long as both frames share the same index.
# NOTE(review): presumably customer_clustering was derived from customer and
# therefore carries the same index - confirm against the earlier cells.
customer_clustering = pd.concat([customer_clustering, customer], axis=1)

# Number of customers per (cluster, churn flag). The result is printed
# explicitly: a bare expression is discarded when run as a script, and in a
# notebook cell only the last expression would be displayed.
print(
    customer_clustering
    .groupby(["cluster", "is_deleted"], as_index=False)["customer_id"]
    .count()
)
# Number of customers per (cluster, routine-use flag).
print(
    customer_clustering
    .groupby(["cluster", "routine_flg"], as_index=False)["customer_id"]
    .count()
)

#--Knock 36------------------------------
# %%
# Parse the usage dates so that the .dt accessor can be used on them.
uselog["usedate"] = pd.to_datetime(uselog["usedate"])
# Add a "年月" (year-month) key formatted as YYYYMM.
uselog["年月"] = uselog["usedate"].dt.strftime("%Y%m")
# Count log rows per (year-month, customer). The per-group count of log_id is
# exposed as "count"; the redundant count of usedate is dropped.
uselog_months = (
    uselog.groupby(["年月", "customer_id"], as_index=False)
    .count()
    .rename(columns={"log_id": "count"})
    .drop(columns="usedate")
)
uselog_months.head()
# 	年月	customer_id	count
# 0	201804	AS002855	4
# 1	201804	AS009013	2
# 2	201804	AS009373	3
# 3	201804	AS015315	6
# 4	201804	AS015739	7

# Build the training table. Every month that has `n_lags` earlier months
# available becomes a target month: its usage count is the label
# (count_pred) and the counts of the preceding months are the features
# (count_0 = one month earlier, ..., count_5 = six months earlier).
# Example: target 201811 uses 201805-201810 as features.
n_lags = 6
# "%Y%m" strings sort chronologically, so sorting guarantees that
# year_months[i - j] really is the month j steps before year_months[i].
year_months = sorted(uselog_months["年月"].unique())
print(year_months)
# ['201804', '201805', '201806', '201807', '201808', '201809', '201810', '201811', '201812', '201901', '201902', '201903']

# Collect one frame per target month and concatenate once at the end,
# instead of repeatedly growing a DataFrame inside the loop.
monthly_frames = []
for i in range(n_lags, len(year_months)):
    # With the sample data above: i = 6, 7, 8, 9, 10, 11
    # Rows of the target month; .rename returns a new frame, so the slice of
    # uselog_months is never modified in place.
    tmp = uselog_months.loc[uselog_months["年月"] == year_months[i]].rename(
        columns={"count": "count_pred"}
    )
    # Sample of the selected rows for i = 6 (before the rename):
    #         年月    customer_id  count
    # 18532  201810    AS002855      3
    # 18533  201810    AS008805      2
    # 18534  201810    AS009373      5
    # 18535  201810    AS015233      7
    # 18536  201810    AS015315      4
    # ...       ...         ...    ...

    # Walk back over the preceding months, nearest first.
    for j in range(1, n_lags + 1):
        tmp_before = (
            uselog_months.loc[uselog_months["年月"] == year_months[i - j]]
            .drop(columns="年月")
            # The month j steps back becomes feature column count_{j-1}.
            .rename(columns={"count": "count_{}".format(j - 1)})
        )
        # Left join keeps every target-month customer; a month without any
        # usage record for that customer shows up as NaN.
        tmp = pd.merge(tmp, tmp_before, on="customer_id", how="left")
        # Sample intermediate state for target month 201812:
        #         年月 customer_id  count_pred  count_0  count_1  count_2  count_3
        # 0     201812    AS002855           2      5.0      3.0      7.0      3.0...
        # 1     201812    AS008805           5      3.0      2.0      2.0      5.0...
        # 2     201812    AS009373           4      4.0      5.0      6.0      6.0...
        # 3     201812    AS015233           9      7.0      7.0      9.0     11.0...
        # 4     201812    AS015315           5      7.0      4.0      7.0      3.0...
        #...      ...         ...         ...      ...      ...      ...      ...
    monthly_frames.append(tmp)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# log covers too few months to form a single target month.
if monthly_frames:
    predict_data = pd.concat(monthly_frames, ignore_index=True)
else:
    predict_data = pd.DataFrame()
print(predict_data.head())
#        年月 customer_id  count_pred  count_0  count_1  count_2  count_3  \
# 0  201810    AS002855           3      7.0      3.0      5.0      5.0   
# 1  201810    AS008805           2      2.0      5.0      7.0      8.0   
# 2  201810    AS009373           5      6.0      6.0      7.0      4.0   
# 3  201810    AS015233           7      9.0     11.0      5.0      7.0   
# 4  201810    AS015315           4      7.0      3.0      6.0      3.0   

#    count_4  count_5  
# 0      5.0      4.0  
# 1      NaN      NaN  
# 2      4.0      3.0  
# 3      7.0      NaN  
# 4      3.0      6.0  

# Keep only rows that have a usage count for every one of the preceding
# months, i.e. drop rows containing NaN, then renumber the index from 0.
predict_data = predict_data.dropna().reset_index(drop=True)