実践データ分析100本ノック(第3章-顧客の全体像を把握する-21~24)

2020年12月21日

実践データ分析100本ノック(第3章-顧客の全体像を把握する-21~24)


# %%
import pandas as pd
uselog = pd.read_csv('./samples/3/use_log.csv')
print(len(uselog))
uselog.head()
# 197428
# log_id	customer_id	usedate
# 0	L00000049012330	AS009373	2018-04-01
# 1	L00000049012331	AS015315	2018-04-01
# 2	L00000049012332	AS040841	2018-04-01
# 3	L00000049012333	AS046594	2018-04-01
# 4	L00000049012334	AS073285	2018-04-01
# %%
customer = pd.read_csv('./samples/3/customer_master.csv')
print(len(customer))
customer.head()
# 4192
# customer_id	name	class	gender	start_date	end_date	campaign_id	is_deleted
# 0	OA832399	XXXX	C01	F	2015-05-01 00:00:00	NaN	CA1	0
# 1	PL270116	XXXXX	C01	M	2015-05-01 00:00:00	NaN	CA1	0
# 2	OA974876	XXXXX	C01	M	2015-05-01 00:00:00	NaN	CA1	0
# 3	HD024127	XXXXX	C01	F	2015-05-01 00:00:00	NaN	CA1	0
# 4	HD661448	XXXXX	C03	F	2015-05-01 00:00:00	NaN	CA1	0
# %%
class_master = pd.read_csv('./samples/3/class_master.csv')
print(len(class_master))
class_master.head()
# 3
# class	class_name	price
# 0	C01	オールタイム	10500
# 1	C02	デイタイム 	7500
# 2	C03	ナイト	     6000
# %%
campaign_master = pd.read_csv('./samples/3/campaign_master.csv')
print(len(campaign_master))
campaign_master.head()
# 3
# campaign_id	campaign_name
# 0	CA1	通常
# 1	CA2	入会費半額
# 2	CA3	入会費無料
# %%
customer_join = pd.merge(customer, class_master, on="class", how="left")
customer_join = pd.merge(customer_join, campaign_master, on="campaign_id", how="left")
customer_join.head()
# customer_id	name	class	gender	start_date	end_date	campaign_id	is_deleted	class_name	price	campaign_name
# 0	OA832399	XXXX	C01	F	2015-05-01 00:00:00	NaN	CA1	0	オールタイム	10500	通常
# 1	PL270116	XXXXX	C01	M	2015-05-01 00:00:00	NaN	CA1	0	オールタイム	10500	通常
# 2	OA974876	XXXXX	C01	M	2015-05-01 00:00:00	NaN	CA1	0	オールタイム	10500	通常
# 3	HD024127	XXXXX	C01	F	2015-05-01 00:00:00	NaN	CA1	0	オールタイム	10500	通常
# 4	HD661448	XXXXX	C03	F	2015-05-01 00:00:00	NaN	CA1	0	ナイト	        6000	通常
# %%
customer_join.isnull().sum()
# customer_id         0
# name                0
# class               0
# gender              0
# start_date          0
# end_date         2842
# campaign_id         0
# is_deleted          0
# class_name          0
# price               0
# campaign_name       0
# dtype: int64
# %%
customer_join.groupby("class_name").count()["customer_id"]
# class_name
# オールタイム    2045
# デイタイム     1019
# ナイト       1128
# Name: customer_id, dtype: int64
# %%
customer_join.groupby("campaign_name").count()["customer_id"]
# campaign_name
# 入会費半額     650
# 入会費無料     492
# 通常       3050
# Name: customer_id, dtype: int64
# %%
customer_join.groupby("gender").count()["customer_id"]
# gender
# F    1983
# M    2209
# Name: customer_id, dtype: int64
# %%
customer_join.groupby("is_deleted").count()["customer_id"]
# is_deleted
# 0    2842
# 1    1350
# Name: customer_id, dtype: int64
# %%
# datetime型に変換
customer_join["start_date"] = pd.to_datetime(customer_join["start_date"])
# 2018年4月1日移行に入会した人数を確認
customer_start = customer_join.loc[customer_join["start_date"] > pd.to_datetime("20180401")] 
print(len(customer_start))
# 1361
# %%
# datetime型に変換
customer_join["end_date"] = pd.to_datetime(customer_join["end_date"])
# 2019年3月31日以降に退会したユーザー、もしくは在籍中のユーザーに絞る
customer_newer = customer_join.loc[(customer_join["end_date"] >= pd.to_datetime("20190331")) | (customer_join["end_date"].isna())]
# 人数を確認
print(len(customer_newer))
# 欠損値(退会していないユーザー)と2019年3月31日に退会したユーザーのみ該当しているか確認
customer_newer["end_date"].unique()
# 2953
# array(['NaT', '2019-03-31T00:00:00.000000000'],
# dtype='datetime64[ns]')
# %%
customer_newer.groupby("class_name").count()["customer_id"]
# class_name
# オールタイム    1444
# デイタイム      696
# ナイト        813
# Name: customer_id, dtype: int64
# %%
customer_newer.groupby("campaign_name").count()["customer_id"]
# campaign_name
# 入会費半額     311
# 入会費無料     242
# 通常       2400
# Name: customer_id, dtype: int64
# %%
customer_newer.groupby("gender").count()["customer_id"]
# gender
# F    1400
# M    1553
# Name: customer_id, dtype: int64
# %%

2020年12月21日