BuildOurOwnRepublic blog rpblic

Search inside Blog:

    Jupyter Notebook Test

    Tags:   JpterNb    Test    DataAnalysis    공모전   
    import numpy as np
    import pandas as pd
    import seaborn as sns
    import matplotlib
    import matplotlib.pyplot as plt
    import pickle as pkl
    
    from scipy.stats import ttest_ind
    from sklearn.preprocessing import RobustScaler
    from sklearn.manifold import TSNE
    from sklearn.mixture import GaussianMixture
    
    # Resolve the family name of a locally installed font file and make it the
    # default matplotlib font (presumably so Hangul labels render; the file is a
    # Korean typeface).
    # A raw string keeps the Windows path literal: the original mixed an unescaped
    # "\W" (an invalid escape sequence) with escaped "\\" separators.
    font= matplotlib.font_manager.FontProperties(fname= r'C:\Windows\Fonts\조선일보명조.ttf').get_name()
    matplotlib.rc('font', family=font)
    

    데이터를 로드하고 분석에 적합한 형태로 변환함

    # CSV whose file name suggests per-ID, per-year-month aggregates.
    df_bymonth = pd.read_csv('./Pre_processed_data/groupby_ID_YEARMONTH.csv')

    # Pickled frame of pre-computed 2-D t-SNE results.
    pt_bytsne = pd.read_pickle('./pkl_result/data_tSNE_dim2_perplex30.pkl')

    # Raw merged shopping records; the file is EUC-KR encoded.
    df_raw = pd.read_csv(
        './Pre_processed_data/merged_dataframe_shopping.csv',
        sep=',',
        encoding='euc-kr',
    )

    # Per-ID summary: receipt count, number of distinct values of
    # BIZ_UNIT / PD_S_C / BR_C, and the summed purchase amount.
    agg_func = {
        'RCT_NO': 'count',
        'BIZ_UNIT': 'nunique',
        'PD_S_C': 'nunique',
        'BR_C': 'nunique',
        'BUY_AM': 'sum',
    }
    df_grouped = (
        df_raw.sort_values('ID')
        .groupby('ID')
        .agg(agg_func)
        .reset_index()
    )

    # One demographic row per ID (first occurrence kept), ordered by ID.
    demo_columns = ['ID', 'AGE_PRD', 'IS_MALE', 'IS_FEMALE', 'PRVIN']
    df_demo = df_raw[demo_columns].drop_duplicates(subset='ID')
    df_demo = df_demo.sort_values('ID').reset_index(drop=True)

    # The same pickle is loaded twice, giving two independent frames; in both
    # the former index becomes a regular column.
    df_byitem = pd.read_pickle('./pkl_result/item_selected_by_middleline.pkl').reset_index()

    df_item_consumed = pd.read_pickle('./pkl_result/item_selected_by_middleline.pkl')
    df_item_consumed.reset_index(inplace=True)
    
    def plot_results(X, Y_= None):
        """Scatter-plot 2-D points, optionally one layer per label, then show the figure.

        Parameters
        ----------
        X : DataFrame
            Points to draw. The x/y coordinates are read from the columns
            labelled ``0`` and ``1`` (or the first two columns when no labels
            are used).
        Y_ : Series, optional
            Labels aligned with ``X``'s index. Each distinct non-null label is
            drawn as its own layer. When ``None`` (or an object without
            ``unique``, which the original silently treated as "no labels"),
            all points are drawn as a single layer.
        """
        # Explicit check instead of relying on AttributeError from None.any():
        # the old try/except also swallowed errors raised while plotting, and
        # `Y_.any()` skipped plotting entirely when every label was 0.
        if Y_ is None or not hasattr(Y_, 'unique'):
            sns.regplot(x= X.iloc[:, 0], y= X.iloc[:, 1], fit_reg= False, scatter_kws= {'s': 2})
            plt.show()
            return

        # Iterate over the labels actually present rather than range(nunique()),
        # which only works when labels are exactly 0..k-1.
        for cluster_id in sorted(Y_.dropna().unique().tolist()):
            in_cluster = Y_ == cluster_id
            sns.regplot(x= X.loc[in_cluster, 0], y= X.loc[in_cluster, 1], fit_reg= False, scatter_kws= {'s': 2})
        plt.show()
    
    def plot_clusters(X, Y_):
        """Show one figure per distinct label with that label's points highlighted.

        For every value in ``Y_`` (ascending order) all points of ``X`` are
        drawn first, then the points carrying that label are drawn again in
        red, and the figure is shown.

        Parameters
        ----------
        X : DataFrame
            Points to draw; coordinates come from columns ``0`` and ``1``.
        Y_ : Series
            Labels aligned with ``X``'s index.
        """
        cluster_ids = Y_.unique().tolist()
        cluster_ids.sort()
        for cluster_id in cluster_ids:
            member_mask = Y_ == cluster_id
            # Background layer: every point, default colour.
            sns.regplot(x= X.iloc[:, 0], y= X.iloc[:, 1], fit_reg= False, scatter_kws= {'s': 2})
            # Foreground layer: only the current cluster, in red.
            sns.regplot(
                x= X.loc[member_mask, 0],
                y= X.loc[member_mask, 1],
                fit_reg= False,
                scatter_kws= {"color": "red", 's': 2},
            )
            plt.show()
    
    # Results of the most recent find_prefered_item() call, keyed by cluster label:
    #   prefered_item_dict_05[label] -> list of column names flagged for that cluster
    #   ft_dict[label]               -> rows of `df` belonging to that cluster
    prefered_item_dict_05= {}
    ft_dict= {}
    def find_prefered_item(df, label, p_val= 0.05):
        """Find, per cluster, the columns whose cluster mean exceeds the overall mean.

        For every distinct value in ``label`` (ascending), each column of ``df``
        is compared between the cluster's rows and all rows with Welch's t-test
        (``equal_var=False``). A column is reported and recorded when the
        p-value is below ``p_val`` and the statistic is positive.

        Results are written to the module-level ``prefered_item_dict_05`` and
        ``ft_dict``; nothing is returned. Progress is printed.

        Parameters
        ----------
        df : DataFrame
            One row per observation, one numeric column per item.
        label : Series
            Cluster label per row, aligned with ``df``'s index.
        p_val : float
            Significance threshold for the two-sided p-value.
        """
        # Clear in place so entries from an earlier call (e.g. one with more
        # clusters) do not linger and get mistaken for current results. The
        # dict objects themselves are kept, so existing references stay valid.
        prefered_item_dict_05.clear()
        ft_dict.clear()

        item_names = df.columns.values.tolist()
        for cluster_id in sorted(label.unique().tolist()):
            cluster_rows = df.loc[label == cluster_id, :]
            ft_dict[cluster_id] = cluster_rows
            print('\nfor cluster {}:\n'.format(str(cluster_id)))
            preferred = []
            prefered_item_dict_05[cluster_id] = preferred
            for item in item_names:
                stat_item, p_val_item = ttest_ind(cluster_rows[item], df[item], equal_var= False)
                # A NaN p-value (e.g. too few rows) compares False and is skipped.
                if p_val_item < p_val and stat_item > 0:
                    print('item {} has different mean with total dataset; ({}, {})'.format(item, stat_item, p_val_item))
                    preferred.append(item)
                    
    def find_prefered_by_clusters(df, label, i, j, p_val= 0.05):
        """Print the columns whose means differ between two clusters.

        Each column of ``df`` is compared between the rows labelled ``i`` and
        the rows labelled ``j`` with Welch's t-test (``equal_var=False``). A
        line is printed for every column whose two-sided p-value is below
        ``p_val``; the sign of the statistic is positive when cluster ``i`` has
        the larger mean. Nothing is returned.

        Parameters
        ----------
        df : DataFrame
            One row per observation, one numeric column per item.
        label : Series
            Cluster label per row, aligned with ``df``'s index.
        i, j : label values
            The two clusters to compare.
        p_val : float
            Significance threshold.
        """
        rows_i = label == i
        rows_j = label == j
        for item in df.columns.values.tolist():
            stat_item, p_val_item = ttest_ind(df.loc[rows_i, item], df.loc[rows_j, item], equal_var= False)
            if p_val_item < p_val:
                # The original message claimed a comparison "with total dataset",
                # copied from find_prefered_item; this test is cluster vs cluster.
                print('item {} has different mean between cluster {} and cluster {}; ({}, {})'.format(item, i, j, stat_item, p_val_item))
    
    # Scatter columns 0 and 1 of the t-SNE frame, one layer per 'y_14_clst' label.
    plot_results(pt_bytsne[[0,1]], pt_bytsne['y_14_clst'])
    

    png

    # One figure per 'y_14_clst' label, with that label's points drawn in red over all points.
    plot_clusters(pt_bytsne[[0,1]], pt_bytsne['y_14_clst'])
    

    png

    png

    png

    png

    png

    png

    png

    png

    png

    png

    png

    png

    png

    png

    plot_clusters(pt_bytsne[[0,1]], df_demo['IS_FEMALE'])
    # Check whether gender is a demographic attribute related to the clusters. According to the plots below, gender does not separate the clusters, although differences in purchased items did show up within clusters.
    

    png

    png

    plot_clusters(pt_bytsne[[0,1]], df_demo['AGE_PRD'])
    # Likewise for age band: users are spread broadly across the embedding regardless of age band.
    

    png

    png

    png

    png

    png

    성별, 연령대 또는 나이에 따라 사용자를 나누어 클러스터링해 보았음. 아래는 여성 사용자만을 대상으로 클러스터링한 결과임

    # Put df_demo's ID column in front of the t-SNE frame (column-wise concat, aligned on index).
    pt_bytsne= pd.concat([df_demo['ID'], pt_bytsne], axis= 1)
    
    # Keep the rows where df_demo['IS_FEMALE'] is 1 and renumber them from 0.
    # NOTE(review): the mask comes from df_demo, so this assumes pt_bytsne shares df_demo's index/row order — confirm.
    pt_female= pt_bytsne.loc[df_demo['IS_FEMALE']==1, :].reset_index(drop= True)
    
    pt_female.head()
    
    ID 0 1 y_14_clst y_28_clst y_24_clst
    0 2 53.016911 -32.975819 10 8 8
    1 3 -22.183092 52.640892 3 9 9
    2 6 42.187847 32.573856 5 17 17
    3 7 -27.630230 52.580269 3 12 12
    4 8 -5.850988 27.713455 12 3 3
    # Rows where df_demo['IS_FEMALE'] is 0, renumbered from 0.
    pt_male= pt_bytsne.loc[df_demo['IS_FEMALE']==0, :].reset_index(drop= True)
    
    # label_dict is not created anywhere in the visible notebook; create it only
    # if it does not exist yet so labels stored earlier are not discarded.
    try:
        label_dict
    except NameError:
        label_dict= {}
    
    # Fit a 14-component full-covariance GMM on the female users' 2-D coordinates.
    gmm = GaussianMixture(n_components=14, covariance_type='full', random_state= 42).fit(pt_female[[0,1]])
    # Store under (2, 14), the key read on the next line. The original used
    # (2, i) with `i` undefined at this scope (apparently a leftover loop variable).
    label_dict[(2,14)]= pd.Series(gmm.predict(pt_female[[0,1]]))
    plot_results(pt_female[[0,1]], label_dict[(2,14)])
    
    # Confirmed that similar clusters form even when only the female users are clustered.
    

    png

    # Item-level rows for the users flagged IS_FEMALE == 1, without the ID column, renumbered from 0.
    # NOTE(review): the mask comes from df_demo, so this assumes df_item_consumed shares df_demo's row order — confirm.
    df_female= df_item_consumed.loc[df_demo['IS_FEMALE']==1, :].drop('ID', axis= 1).reset_index(drop= True)
    
    df_female.head()
    
    H&B선물세트 VIDEOGAME 가공식품 가공우유 가구 가방브랜드 계절완구 고급 고양이용품 골프 ... 필기용품 한방차 한우선물세트 해초 헬스용품 호주산소고기 홍인삼 화과자 황태 훼이셜케어
    0 0.0 0.0 6.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
    1 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0
    2 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 14.0 0.0 1.0 0.0 0.0
    3 0.0 0.0 18.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
    4 0.0 0.0 23.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

    5 rows × 149 columns

    남성 사용자를 클러스터링해 보았음. 이때 6번 클러스터와 7번 클러스터 간의 차이를 파악하였음.

    # t-SNE rows and item-level rows for users with IS_FEMALE == 0, each renumbered from 0.
    pt_male= pt_bytsne.loc[df_demo['IS_FEMALE']==0, :].reset_index(drop= True)
    df_male= df_item_consumed.loc[df_demo['IS_FEMALE']==0, :].drop('ID', axis= 1).reset_index(drop= True)
    # Sorted unique IDs of raw records with IS_MALE == 1, and the matching demographic rows.
    # NOTE(review): the two frames above select on IS_FEMALE == 0 while these select on IS_MALE == 1;
    # they only describe the same users if the two flags are exact complements — confirm.
    id_male= df_raw.loc[df_raw['IS_MALE']==1, 'ID'].drop_duplicates().sort_values().reset_index(drop= True)
    df_demo_male= df_demo.loc[df_demo['ID'].isin(id_male), :].reset_index(drop= True)
    df_male.info()
    
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 7003 entries, 0 to 7002
    Columns: 149 entries, H&B선물세트 to 훼이셜케어
    dtypes: float64(149)
    memory usage: 8.0 MB
    
    # Fit a 14-component full-covariance GMM on the male users' 2-D coordinates.
    gmm = GaussianMixture(n_components=14, covariance_type='full', random_state= 42).fit(pt_male[[0,1]])
    # Store the predicted labels under (2, 14); this replaces whatever was stored under that key before.
    label_dict[(2, 14)]= pd.Series(gmm.predict(pt_male[[0,1]]))
    plot_results(pt_male[[0,1]], label_dict[(2,14)])
    

    png

    # First rows of the item table for IS_MALE == 1 users whose label is 6.
    # reset_index() renumbers rows 0..n-1 so the label Series lines up positionally.
    # NOTE(review): labels were predicted on rows selected with IS_FEMALE == 0 — confirm both selections match.
    df_byitem.loc[df_demo['IS_MALE']==1].reset_index().loc[label_dict[(2,14)]==6, :].head()
    
    index ID H&B선물세트 VIDEOGAME 가공식품 가공우유 가구 가방브랜드 계절완구 고급 ... 필기용품 한방차 한우선물세트 해초 헬스용품 호주산소고기 홍인삼 화과자 황태 훼이셜케어
    1 3 4 0.0 0.0 38.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
    12 26 29 0.0 0.0 11.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 9.0 0.0 0.0 4.0 1.0
    19 39 42 0.0 0.0 22.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 3.0 0.0 0.0 3.0 0.0 0.0 0.0 1.0 1.0
    22 46 50 0.0 0.0 9.0 2.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 1.0 0.0 15.0 14.0 0.0 0.0 0.0
    26 52 56 0.0 0.0 13.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0

    5 rows × 151 columns

    6번 클러스터와 7번 클러스터 간의 지표 차이를 파악하였음.

    # Demographic rows of the male users labelled 6 and 7.
    # NOTE(review): assumes label_dict[(2,14)] currently holds the labels fitted on pt_male and that
    # its 0..n-1 index matches df_demo_male row for row — confirm.
    df_clst6= df_demo_male.loc[label_dict[(2,14)]==6, :]
    
    df_clst7= df_demo_male.loc[label_dict[(2,14)]==7, :]
    
    # Per-ID purchase summaries (from df_grouped) for the IDs in each cluster.
    df_info6= df_grouped.loc[df_grouped['ID'].isin(df_clst6['ID']), :]
    df_info7= df_grouped.loc[df_grouped['ID'].isin(df_clst7['ID']), :]
    
    df_info6.head()
    
    ID RCT_NO BIZ_UNIT PD_S_C BR_C BUY_AM
    3 4 308 3 136 7 7970116
    26 29 408 2 159 5 3756730
    39 42 492 2 138 3 5425186
    46 50 566 3 238 11 10626666
    52 56 107 3 58 6 2197496
    # Number of cluster-6 users per AGE_PRD value, ordered by the AGE_PRD value.
    df_clst6.AGE_PRD.value_counts().sort_index()
    
    20     55
    30    156
    40    249
    50    212
    60    129
    Name: AGE_PRD, dtype: int64
    
    # Count of users per PRVIN value in each cluster (ascending by count) and the
    # corresponding share of the cluster, rounded to 3 decimals.
    df_value6= df_clst6.PRVIN.value_counts().sort_values()
    df_value7= df_clst7.PRVIN.value_counts().sort_values()
    df_ratio6= (df_value6/df_clst6.PRVIN.count()).apply(lambda x: round(x, 3))
    df_ratio7= (df_value7/df_clst7.PRVIN.count()).apply(lambda x: round(x, 3))
    
    # Cluster 6: one horizontal line per category ending in a dot, annotated with the ratio.
    num_of_range= range(1, df_value6.count()+1)
    grp= plt.figure(figsize= (9,12))
    ax= grp.add_subplot(211)
    plt.hlines(y=num_of_range, xmin=0, xmax=df_ratio6, color='skyblue')
    for i, ratio in zip(num_of_range, df_ratio6.values.tolist()):
        # Nudge the text slightly right of / above the dot so the two do not overlap.
        ax.annotate(ratio, xy=(ratio+0.01, i+0.01), textcoords='data')
    plt.plot(df_ratio6, num_of_range, "o")
    plt.yticks(num_of_range, df_value6.index)
    
    # Cluster 7: same chart on its own figure.
    num_of_range= range(1, df_value7.count()+1)
    grp= plt.figure(figsize= (9,9))
    ax= grp.add_subplot(212)
    plt.hlines(y=num_of_range, xmin=0, xmax=df_ratio7, color='tomato')
    for i, ratio in zip(num_of_range, df_ratio7.values.tolist()):
        ax.annotate(ratio, xy=(ratio+0.01, i+0.01), textcoords='data')
    plt.plot(df_ratio7, num_of_range, "o", color= 'red')
    # Label the rows with cluster 7's own categories. The original reused
    # df_value6.index here; since both series are sorted by count, that index can
    # be in a different order (or of a different length) than the plotted data.
    plt.yticks(num_of_range, df_value7.index)
    plt.show()
    

    png

    png

    # Count of users per AGE_PRD value in each cluster (ordered by the AGE_PRD
    # value) and the corresponding share of the cluster, rounded to 3 decimals.
    df_value6= df_clst6.AGE_PRD.value_counts().sort_index()
    df_value7= df_clst7.AGE_PRD.value_counts().sort_index()
    df_ratio6= (df_value6/df_clst6.AGE_PRD.count()).apply(lambda x: round(x, 3))
    df_ratio7= (df_value7/df_clst7.AGE_PRD.count()).apply(lambda x: round(x, 3))
    
    # Cluster 6: one horizontal line per category ending in a dot, annotated with the ratio.
    num_of_range= range(1, df_value6.count()+1)
    grp= plt.figure(figsize= (9,12))
    ax= grp.add_subplot(211)
    plt.hlines(y=num_of_range, xmin=0, xmax=df_ratio6, color='skyblue')
    for i, ratio in zip(num_of_range, df_ratio6.values.tolist()):
        # Nudge the text slightly right of / above the dot so the two do not overlap.
        ax.annotate(ratio, xy=(ratio+0.01, i+0.01), textcoords='data')
    plt.plot(df_ratio6, num_of_range, "o")
    plt.yticks(num_of_range, df_value6.index)
    
    # Cluster 7: same chart on its own figure.
    num_of_range= range(1, df_value7.count()+1)
    grp= plt.figure(figsize= (9,9))
    ax= grp.add_subplot(212)
    plt.hlines(y=num_of_range, xmin=0, xmax=df_ratio7, color='tomato')
    for i, ratio in zip(num_of_range, df_ratio7.values.tolist()):
        ax.annotate(ratio, xy=(ratio+0.01, i+0.01), textcoords='data')
    plt.plot(df_ratio7, num_of_range, "o", color= 'red')
    # Label the rows with cluster 7's own age bands. The original reused
    # df_value6.index, which is wrong whenever the two clusters do not contain
    # exactly the same set of age bands.
    plt.yticks(num_of_range, df_value7.index)
    plt.show()
    

    png

    png

    # Overlaid normalized histograms (no KDE) of BUY_AM for clusters 6 and 7.
    # NOTE(review): cluster 6 uses 30 bins while cluster 7 uses the default binning — confirm this is intended.
    grp= sns.distplot(df_info6['BUY_AM'], label= 'cluster 6', bins= 30, kde= False, norm_hist= True)
    grp= sns.distplot(df_info7['BUY_AM'], label= 'cluster 7', kde= False, norm_hist= True)
    plt.legend()
    plt.show()
    

    png

    # Overlaid normalized histograms (no KDE) of RCT_NO (receipt count per ID) for clusters 6 and 7.
    grp= sns.distplot(df_info6['RCT_NO'], label= 'cluster 6', bins= 30, kde= False, norm_hist= True)
    grp= sns.distplot(df_info7['RCT_NO'], label= 'cluster 7', kde= False, norm_hist= True)
    plt.legend()
    plt.show()
    

    png

    # Overlaid histograms with KDE curves of PD_S_C (distinct PD_S_C values per ID) for clusters 6 and 7.
    grp= sns.distplot(df_info6['PD_S_C'], label= 'cluster 6', bins= 30, kde= True)
    grp= sns.distplot(df_info7['PD_S_C'], label= 'cluster 7', kde= True)
    plt.legend()
    plt.show()
    

    png

    # Overlaid histograms with KDE curves of BR_C (distinct BR_C values per ID) for clusters 6 and 7.
    grp= sns.distplot(df_info6['BR_C'], label= 'cluster 6', bins= 30, kde= True)
    grp= sns.distplot(df_info7['BR_C'], label= 'cluster 7', kde= True)
    plt.legend()
    plt.show()
    

    png

    20-40대 서울 거주 여성을 클러스터링함

    # Select female users with AGE_PRD 20, 30 or 40 whose PRVIN is '서울특별시' (Seoul).
    # The mask is built once and reused so the two frames cannot drift apart.
    # NOTE(review): the mask comes from df_demo, so this assumes pt_bytsne and
    # df_item_consumed share df_demo's row order — confirm.
    is_target_user= (
        (df_demo['IS_FEMALE']==1)
        & df_demo['AGE_PRD'].isin([20, 30, 40])
        & (df_demo['PRVIN']=='서울특별시')
    )
    pt_specified= pt_bytsne.loc[is_target_user, :].reset_index(drop= True)
    df_specified= df_item_consumed.loc[is_target_user, :].drop('ID', axis= 1).reset_index(drop= True)
    df_specified.info()
    
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 2554 entries, 0 to 2553
    Columns: 149 entries, H&B선물세트 to 훼이셜케어
    dtypes: float64(149)
    memory usage: 2.9 MB
    
    # Unlabelled scatter of the selected users' 2-D coordinates.
    plot_results(pt_specified[[0,1]])
    

    png

    # Fit a 10-component full-covariance GMM on the selected users' 2-D coordinates.
    gmm = GaussianMixture(n_components=10, covariance_type='full', random_state= 42).fit(pt_specified[[0,1]])
    # Store under (2, 10), the key read on the next line. The original used
    # (2, i) with `i` undefined at this scope (apparently a leftover loop variable).
    label_dict[(2,10)]= pd.Series(gmm.predict(pt_specified[[0,1]]))
    plot_results(pt_specified[[0,1]], label_dict[(2,10)])
    

    png

    # One figure per label stored under (2, 10), with that label's points drawn in red over all points.
    plot_clusters(pt_specified[[0,1]], label_dict[(2,10)])
    

    png

    png

    png

    png

    png

    png

    png

    png

    png

    png