BuildOurOwnRepublic blog rpblic

Search inside Blog:

    Jupyter Notebook HTML Template Test

    Tags:   JpterNb    Test    DataAnalysis    공모전   
    clustering_with_segmentation
    Jupyter Notebook을 html 파일로 내보낸 뒤 yaml 라벨을 붙여서 포스팅하자. 이때 내보낸 html에는 css 정보가 함께 저장되어 있으므로, html, body와 관련된 css 세팅을 지워야 블로그에서 정상적으로 나타난다.
    In [1]:
    import numpy as np
    import pandas as pd
    import seaborn as sns
    import matplotlib
    import matplotlib.pyplot as plt
    import pickle as pkl
    
    from scipy.stats import ttest_ind
    from sklearn.preprocessing import RobustScaler
    from sklearn.manifold import TSNE
    from sklearn.mixture import GaussianMixture
    
    # Use the font stored in the given .ttf file as matplotlib's default family
    # (presumably so that the Korean column names render in the figures).
    # The path is written as a raw string: the original literal contained the
    # invalid escape sequence '\W', which recent Python versions warn about.
    # The resulting path value is unchanged.
    font= matplotlib.font_manager.FontProperties(fname= r'C:\Windows\Fonts\조선일보명조.ttf').get_name()
    matplotlib.rc('font', family=font)
    

    데이터를 로드하고 분석에 적합한 형태로 변환함

    In [2]:
    df_bymonth= pd.read_csv('./Pre_processed_data/groupby_ID_YEARMONTH.csv')
    
    In [3]:
    pt_bytsne= pd.read_pickle('./pkl_result/data_tSNE_dim2_perplex30.pkl')
    
    In [4]:
    df_raw= pd.read_csv('./Pre_processed_data/merged_dataframe_shopping.csv', delimiter= ',', encoding= 'euc-kr')
    
    In [5]:
    df_grouped= df_raw.sort_values('ID')
    
    In [6]:
    # Aggregation spec applied per group: count of RCT_NO entries, number of
    # distinct BIZ_UNIT / PD_S_C / BR_C values, and the sum of BUY_AM.
    # The built-in 'nunique' reducer replaces the per-group Python lambdas;
    # the resulting columns and values are the same.
    agg_func= {'RCT_NO': 'count', 'BIZ_UNIT': 'nunique', 'PD_S_C': 'nunique',
               'BR_C': 'nunique', 'BUY_AM': 'sum'}
    
    In [7]:
    df_grouped= df_grouped.groupby('ID').agg(agg_func).reset_index(drop= False)
    
    In [8]:
    df_demo= df_raw[['ID', 'AGE_PRD', 'IS_MALE', 'IS_FEMALE', 'PRVIN']].drop_duplicates(subset= 'ID').sort_values('ID').reset_index(drop= True)
    
    In [9]:
    df_byitem= pd.read_pickle('./pkl_result/item_selected_by_middleline.pkl')
    
    In [10]:
    df_byitem= df_byitem.reset_index(drop=False)
    
    In [11]:
    df_item_consumed= pd.read_pickle('./pkl_result/item_selected_by_middleline.pkl')
    
    In [12]:
    df_item_consumed.reset_index(drop= False, inplace= True)
    
    In [13]:
    def plot_results(X, Y_= None):
        """Scatter-plot the columns labelled 0 and 1 of X, optionally split by label.

        Parameters
        ----------
        X : DataFrame
            Column 0 is drawn on the x axis and column 1 on the y axis.
        Y_ : Series of labels, optional
            When given, the rows of X selected by ``Y_ == label`` are drawn as a
            separate layer for every distinct label. When omitted (or when the
            argument is not Series-like), all rows are drawn as one layer.

        The figure is displayed with ``plt.show()``; nothing is returned.
        """
        # Explicit check instead of catching AttributeError from `Y_.any()`:
        # the old test skipped plotting entirely when every label was 0, and the
        # blanket except could hide unrelated AttributeErrors raised while plotting.
        if Y_ is None or not hasattr(Y_, 'unique'):
            sns.regplot(x= X.iloc[:,0], y= X.iloc[:, 1], fit_reg= False, scatter_kws= {'s': 2})
        else:
            # Iterate over the labels actually present rather than range(nunique()),
            # so label sets that are not exactly 0..k-1 are still drawn completely.
            for i in sorted(Y_.unique().tolist()):
                sns.regplot(x= X.loc[Y_==i, 0], y= X.loc[Y_==i, 1], fit_reg= False, scatter_kws= {'s': 2})
        plt.show()
    
    In [14]:
    def plot_clusters(X, Y_):
        """Show one figure per distinct label in Y_.

        Each figure draws every row of X (columns 0 and 1) as a background layer
        and then overlays, in red, the rows selected by ``Y_ == label``.
        Labels are visited in ascending order; nothing is returned.
        """
        ordered_labels= Y_.unique().tolist()
        ordered_labels.sort()
        for label in ordered_labels:
            in_cluster= Y_==label
            # Background: the full point cloud.
            sns.regplot(x= X.iloc[:,0], y= X.iloc[:, 1], fit_reg= False, scatter_kws= {'s': 2})
            # Foreground: only the current cluster, highlighted in red.
            sns.regplot(x= X.loc[in_cluster, 0], y= X.loc[in_cluster, 1], fit_reg= False,
                        scatter_kws= {"color": "red", 's': 2})
            plt.show()
    
    In [15]:
    # Module-level stores that find_prefered_item fills in place:
    #   prefered_item_dict_05[label] -> list of column names flagged for that label
    #   ft_dict[label]               -> the rows of df carrying that label
    prefered_item_dict_05= {}
    ft_dict= {}
    def find_prefered_item(df, label, p_val= 0.05):
        """Flag, per cluster, the columns whose mean is significantly above the overall mean.

        For every distinct value in `label` (ascending), each column of `df` is
        compared between the cluster's rows and all rows with Welch's t-test
        (``equal_var=False``). A column is flagged when the p-value is below
        `p_val` and the statistic is positive. Flagged columns are printed and
        appended to ``prefered_item_dict_05[cluster]``; the cluster's rows are
        kept in ``ft_dict[cluster]``. Nothing is returned.
        """
        item_names= df.columns.values.tolist()
        for cluster in sorted(label.unique().tolist()):
            members= df.loc[label==cluster, :]
            ft_dict[cluster]= members
            print('\nfor cluster {}:\n'.format(str(cluster)))
            flagged= []
            prefered_item_dict_05[cluster]= flagged
            for item in item_names:
                stat_item, p_val_item= ttest_ind(members[item], df[item], equal_var= False)
                # Skip columns that are not significant or not above the overall mean.
                if not (p_val_item< p_val and stat_item>0):
                    continue
                print('item {} has different mean with total dataset; ({}, {})'.format(item, stat_item, p_val_item))
                flagged.append(item)
                    
    def find_prefered_by_clusters(df, label, i, j, p_val= 0.05):
        """Print the columns of df whose mean differs significantly between two clusters.

        Parameters
        ----------
        df : DataFrame with one column per item.
        label : Series of cluster labels used to select rows of df.
        i, j : the two label values to compare.
        p_val : significance threshold for Welch's t-test (``equal_var=False``).

        Every column with a p-value below `p_val` is printed together with the
        test statistic and p-value. Nothing is returned.
        """
        for item in df.columns.values.tolist():
            (stat_item, p_val_item)=  ttest_ind(df.loc[label==i, item], df.loc[label==j, item], equal_var= False)
            if p_val_item< p_val:
                # The comparison is cluster i vs cluster j, so the message names them
                # (the previous text wrongly said "with total dataset").
                print('item {} has different mean between clusters {} and {}; ({}, {})'.format(item, i, j, stat_item, p_val_item))
    
    In [16]:
    plot_results(pt_bytsne[[0,1]], pt_bytsne['y_14_clst'])
    
    In [17]:
    plot_results(pt_bytsne[[0,1]])
    
    In [18]:
    plot_clusters(pt_bytsne[[0,1]], pt_bytsne['y_14_clst'])
    
    In [26]:
    # Fit a 14-component, full-covariance Gaussian mixture on columns 0 and 1 of
    # pt_female (defined in a cell not shown here) and plot the predicted labels.
    gmm = GaussianMixture(n_components=14, covariance_type='full', random_state= 42).fit(pt_female[[0,1]])
    # Store the labels under the same (2, 14) key that is read on the next line.
    # The original key (2, i) depended on whatever value a leftover loop variable
    # `i` happened to hold, so the plot could show labels from a different fit.
    label_dict[(2,14)]= pd.Series(gmm.predict(pt_female[[0,1]]))
    plot_results(pt_female[[0,1]], label_dict[(2,14)])
    
    # 여성들만을 군집화한 경우에도 유사한 클러스터가 형성되는 것을 확인함
    
    In [28]:
    df_female= df_item_consumed.loc[df_demo['IS_FEMALE']==1, :].drop('ID', axis= 1).reset_index(drop= True)
    
    In [29]:
    df_female.head()
    
    Out[29]:
    H&B선물세트 VIDEOGAME 가공식품 가공우유 가구 가방브랜드 계절완구 고급 고양이용품 골프 ... 필기용품 한방차 한우선물세트 해초 헬스용품 호주산소고기 홍인삼 화과자 황태 훼이셜케어
    0 0.0 0.0 6.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
    1 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0
    2 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 14.0 0.0 1.0 0.0 0.0
    3 0.0 0.0 18.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
    4 0.0 0.0 23.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

    5 rows × 149 columns

    남성 사용자를 클러스터링해 보았음. 이 때 6번 클러스터와 7번 클러스터 간의 차이를 파악하였음.

    In [31]:
    # Build the male subsets used by the following cells.
    # pt_male / df_male: rows of pt_bytsne / df_item_consumed where df_demo['IS_FEMALE'] is 0.
    # NOTE(review): the boolean mask comes from df_demo, so this assumes pt_bytsne and
    # df_item_consumed share df_demo's index and row order - TODO confirm.
    pt_male= pt_bytsne.loc[df_demo['IS_FEMALE']==0, :].reset_index(drop= True)
    df_male= df_item_consumed.loc[df_demo['IS_FEMALE']==0, :].drop('ID', axis= 1).reset_index(drop= True)
    # id_male / df_demo_male: selected through IS_MALE==1 in df_raw instead.
    # NOTE(review): this matches the IS_FEMALE==0 selection above only if every ID is
    # flagged as exactly one of male/female - verify, otherwise the row counts differ.
    id_male= df_raw.loc[df_raw['IS_MALE']==1, 'ID'].drop_duplicates().sort_values().reset_index(drop= True)
    df_demo_male= df_demo.loc[df_demo['ID'].isin(id_male), :].reset_index(drop= True)
    # Print a summary of the male item table.
    df_male.info()
    
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 7003 entries, 0 to 7002
    Columns: 149 entries, H&B선물세트 to 훼이셜케어
    dtypes: float64(149)
    memory usage: 8.0 MB
    
    In [32]:
    # Fit a 14-component, full-covariance Gaussian mixture (seed 42) on columns 0 and 1 of pt_male.
    gmm = GaussianMixture(n_components=14, covariance_type='full', random_state= 42).fit(pt_male[[0,1]])
    # Store the predicted labels as a Series under key (2, 14); this replaces any
    # labels previously stored under that key in label_dict (defined in a cell not shown here).
    label_dict[(2, 14)]= pd.Series(gmm.predict(pt_male[[0,1]]))
    # Plot the male points, one layer per predicted label.
    plot_results(pt_male[[0,1]], label_dict[(2,14)])
    
    In [36]:
    df_byitem.loc[df_demo['IS_MALE']==1].reset_index().loc[label_dict[(2,14)]==6, :].head()
    
    Out[36]:
    index ID H&B선물세트 VIDEOGAME 가공식품 가공우유 가구 가방브랜드 계절완구 고급 ... 필기용품 한방차 한우선물세트 해초 헬스용품 호주산소고기 홍인삼 화과자 황태 훼이셜케어
    1 3 4 0.0 0.0 38.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
    12 26 29 0.0 0.0 11.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 9.0 0.0 0.0 4.0 1.0
    19 39 42 0.0 0.0 22.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 3.0 0.0 0.0 3.0 0.0 0.0 0.0 1.0 1.0
    22 46 50 0.0 0.0 9.0 2.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 1.0 0.0 15.0 14.0 0.0 0.0 0.0
    26 52 56 0.0 0.0 13.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0

    5 rows × 151 columns

    6번 클러스터와 7번 클러스터 간의 지표 차이를 파악하였음.

    In [37]:
    df_clst6= df_demo_male.loc[label_dict[(2,14)]==6, :]
    
    In [38]:
    df_clst7= df_demo_male.loc[label_dict[(2,14)]==7, :]
    
    In [39]:
    df_info6= df_grouped.loc[df_grouped['ID'].isin(df_clst6['ID']), :]
    df_info7= df_grouped.loc[df_grouped['ID'].isin(df_clst7['ID']), :]
    
    In [40]:
    df_info6.head()
    
    Out[40]:
    ID RCT_NO BIZ_UNIT PD_S_C BR_C BUY_AM
    3 4 308 3 136 7 7970116
    26 29 408 2 159 5 3756730
    39 42 492 2 138 3 5425186
    46 50 566 3 238 11 10626666
    52 56 107 3 58 6 2197496
    In [41]:
    df_clst6.AGE_PRD.value_counts().sort_index()
    
    Out[41]:
    20     55
    30    156
    40    249
    50    212
    60    129
    Name: AGE_PRD, dtype: int64
    In [42]:
    # Share of each PRVIN value within cluster 6 and within cluster 7, drawn as
    # two lollipop charts. Categories are ordered by their count in each cluster.
    df_value6= df_clst6.PRVIN.value_counts().sort_values()
    df_value7= df_clst7.PRVIN.value_counts().sort_values()
    df_ratio6= (df_value6/df_clst6.PRVIN.count()).apply(lambda x: round(x, 3))
    df_ratio7= (df_value7/df_clst7.PRVIN.count()).apply(lambda x: round(x, 3))
    
    # Cluster 6: one horizontal stem per category, annotated with its ratio.
    num_of_range= range(1, df_value6.count()+1)
    grp= plt.figure(figsize= (9,12))
    ax= grp.add_subplot(211)
    plt.hlines(y=num_of_range, xmin=0, xmax=df_ratio6, color='skyblue')
    for i, ratio in zip(num_of_range, df_ratio6.values.tolist()):
        ax.annotate(ratio, xy=(ratio+0.01, i+0.01), textcoords='data')
    plt.plot(df_ratio6, num_of_range, "o")
    plt.yticks(num_of_range, df_value6.index)
    
    # Cluster 7: same layout in a second figure.
    num_of_range= range(1, df_value7.count()+1)
    grp= plt.figure(figsize= (9,9))
    ax= grp.add_subplot(212)
    plt.hlines(y=num_of_range, xmin=0, xmax=df_ratio7, color='tomato')
    for i, ratio in zip(num_of_range, df_ratio7.values.tolist()):
        ax.annotate(ratio, xy=(ratio+0.01, i+0.01), textcoords='data')
    plt.plot(df_ratio7, num_of_range, "o", color= 'red')
    # Label the rows with cluster 7's own categories. The original reused
    # df_value6.index here, which is ordered by cluster 6's counts (and may have
    # a different length), so the tick labels did not match the plotted ratios.
    plt.yticks(num_of_range, df_value7.index)
    plt.show()
    
    In [43]:
    # Share of each AGE_PRD value within cluster 6 and within cluster 7, drawn as
    # two lollipop charts. Categories are ordered by their AGE_PRD value.
    df_value6= df_clst6.AGE_PRD.value_counts().sort_index()
    df_value7= df_clst7.AGE_PRD.value_counts().sort_index()
    df_ratio6= (df_value6/df_clst6.AGE_PRD.count()).apply(lambda x: round(x, 3))
    df_ratio7= (df_value7/df_clst7.AGE_PRD.count()).apply(lambda x: round(x, 3))
    
    # Cluster 6: one horizontal stem per category, annotated with its ratio.
    num_of_range= range(1, df_value6.count()+1)
    grp= plt.figure(figsize= (9,12))
    ax= grp.add_subplot(211)
    plt.hlines(y=num_of_range, xmin=0, xmax=df_ratio6, color='skyblue')
    for i, ratio in zip(num_of_range, df_ratio6.values.tolist()):
        ax.annotate(ratio, xy=(ratio+0.01, i+0.01), textcoords='data')
    plt.plot(df_ratio6, num_of_range, "o")
    plt.yticks(num_of_range, df_value6.index)
    
    # Cluster 7: same layout in a second figure.
    num_of_range= range(1, df_value7.count()+1)
    grp= plt.figure(figsize= (9,9))
    ax= grp.add_subplot(212)
    plt.hlines(y=num_of_range, xmin=0, xmax=df_ratio7, color='tomato')
    for i, ratio in zip(num_of_range, df_ratio7.values.tolist()):
        ax.annotate(ratio, xy=(ratio+0.01, i+0.01), textcoords='data')
    plt.plot(df_ratio7, num_of_range, "o", color= 'red')
    # Label the rows with cluster 7's own categories. The original reused
    # df_value6.index, which is wrong whenever the two clusters do not contain
    # exactly the same set of AGE_PRD values.
    plt.yticks(num_of_range, df_value7.index)
    plt.show()
    
    In [44]:
    grp= sns.distplot(df_info6['BUY_AM'], label= 'cluster 6', bins= 30, kde= False, norm_hist= True)
    grp= sns.distplot(df_info7['BUY_AM'], label= 'cluster 7', kde= False, norm_hist= True)
    plt.legend()
    plt.show()
    
    In [45]:
    grp= sns.distplot(df_info6['RCT_NO'], label= 'cluster 6', bins= 30, kde= False, norm_hist= True)
    grp= sns.distplot(df_info7['RCT_NO'], label= 'cluster 7', kde= False, norm_hist= True)
    plt.legend()
    plt.show()
    
    In [46]:
    grp= sns.distplot(df_info6['PD_S_C'], label= 'cluster 6', bins= 30, kde= True)
    grp= sns.distplot(df_info7['PD_S_C'], label= 'cluster 7', kde= True)
    plt.legend()
    plt.show()
    
    In [47]:
    grp= sns.distplot(df_info6['BR_C'], label= 'cluster 6', bins= 30, kde= True)
    grp= sns.distplot(df_info7['BR_C'], label= 'cluster 7', kde= True)
    plt.legend()
    plt.show()
    

    20-40대 서울 거주 여성을 클러스터링함

    In [48]:
    # Select the rows where df_demo marks IS_FEMALE==1, AGE_PRD in {20, 30, 40}
    # and PRVIN=='서울특별시'. The mask is built once and reused; the original
    # repeated the same three-clause expression for both frames.
    # NOTE(review): applying a df_demo mask to pt_bytsne / df_item_consumed assumes
    # all three frames share the same index and row order - TODO confirm.
    is_target= (df_demo['IS_FEMALE']==1)&(df_demo['AGE_PRD'].isin([20, 30, 40]))&(df_demo['PRVIN']=='서울특별시')
    pt_specified= pt_bytsne.loc[is_target, :].reset_index(drop= True)
    df_specified= df_item_consumed.loc[is_target, :].drop('ID', axis= 1).reset_index(drop= True)
    df_specified.info()
    
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 2554 entries, 0 to 2553
    Columns: 149 entries, H&B선물세트 to 훼이셜케어
    dtypes: float64(149)
    memory usage: 2.9 MB
    
    In [49]:
    plot_results(pt_specified[[0,1]])
    
    In [51]:
    # Fit a 10-component, full-covariance Gaussian mixture on columns 0 and 1 of
    # pt_specified and plot the predicted labels.
    gmm = GaussianMixture(n_components=10, covariance_type='full', random_state= 42).fit(pt_specified[[0,1]])
    # Store the labels under (2, 10), the key read on the next line and by the
    # following plot_clusters call. The original key (2, i) used a leftover loop
    # variable, so the plots could pick up labels from an unrelated earlier fit.
    label_dict[(2,10)]= pd.Series(gmm.predict(pt_specified[[0,1]]))
    plot_results(pt_specified[[0,1]], label_dict[(2,10)])
    
    In [52]:
    plot_clusters(pt_specified[[0,1]], label_dict[(2,10)])