Friday, 25 December 2020

Py_Neuron Network (Simple ANN)

 # -*- coding: utf-8 -*-

"""

Created on Tue Nov  3 10:01:10 2020


@author: phulh5

"""


import sys        

import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPClassifier

from dmba import classificationSummary

from alive_progress import alive_bar

from tqdm import tqdm

import pickle

from sklearn.decomposition import PCA

from sklearn import preprocessing

from sklearn.preprocessing import MinMaxScaler, StandardScaler

import pandasql as ps

from imblearn.over_sampling import SMOTE


#show up to 100 rows and columns when printing DataFrames

pd.set_option('display.max_rows', 100)

pd.set_option('display.max_columns', 100)


#import dataset     

df1 = pd.read_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/DL_komua_random.csv")

df2 = pd.read_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/DL_muagoi.csv")


df = pd.concat([df1,df2])


df = pd.DataFrame(df)


#preview the first and last rows

df.head(n=10)

df.tail(n=6)


df = df.drop(columns = 'randnum')

df = df.drop(columns = 'home_tinh')


#summary

df.describe()


#check all columns 

df.columns


########Data dimension reduction - Principal Components Analysis##############

#fit PCA on the numeric predictor columns (rows with missing values dropped)


pcs = PCA()


pcs.fit(df.iloc[:,1:119].dropna(axis = 0))


pcsSummary_df = pd.DataFrame(

    {'Standard deviation': np.sqrt(pcs.explained_variance_),

     'Proportion of variance': pcs.explained_variance_ratio_,

     'Cumulative proportion': np.cumsum(pcs.explained_variance_ratio_)}

    )

    

pcsSummary_df = pcsSummary_df.transpose()


pcsSummary_df.columns = ['PC' + str(i) for i 

                         in range(1, len(pcsSummary_df.columns) + 1)]


pcsSummary_df.round(4)
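
# A small sketch (assuming pcs was fitted as above): pick the number of components
# needed to reach roughly 95% of the cumulative variance, then re-fit a reduced PCA.
cum_var = np.cumsum(pcs.explained_variance_ratio_)
n_comp_95 = int(np.argmax(cum_var >= 0.95) + 1)
print('components covering 95% of variance:', n_comp_95)

pca_reduced = PCA(n_components = n_comp_95)
scores = pca_reduced.fit_transform(df.iloc[:, 1:119].dropna(axis = 0))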


##Normalizing (Standardizing) and Rescaling Data######################

from sklearn.preprocessing import MinMaxScaler, StandardScaler


missing_cnt = df.isnull().sum()


df = df.fillna(0)


norm_df = (df.drop(columns = 'isdn') - df.drop(columns ='isdn').min())/ (df.drop(columns ='isdn').max() - df.drop(columns ='isdn').min())


norm_df.head(6)


#or use scikit-learn:

scaler = MinMaxScaler()

norm_df_v1 = pd.DataFrame(scaler.fit_transform(df),

                          index= df.index,

                          columns = df.columns)


compare = pd.concat([norm_df['tuoi_khach_hang_200901'], 

                       norm_df_v1['tuoi_khach_hang_200901']

                       ],

                    axis = 1,

                    keys = ['norm_df', 'norm_df_v1']

                    )
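
# A hedged alternative to min-max scaling: z-score standardization with StandardScaler
# (mean 0, standard deviation 1 per column). Shown on a separate frame so norm_df is untouched.
z_scaler = StandardScaler()
std_df = pd.DataFrame(z_scaler.fit_transform(df.drop(columns = 'isdn')),
                      index = df.index,
                      columns = df.drop(columns = 'isdn').columns)
std_df['tuoi_khach_hang_200901'].describe()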


##################### case_when / if-else: derive the purchase-channel label #####################

norm_df.head()


conditions = [
    (df['BHTT'] == 1) & (df['CHTT'] != 1) & (df['CHUOI'] != 1) & (df['CHUQ'] != 1) & (df['TELESALE'] != 1) & (df['DB'] != 1),
    (df['BHTT'] != 1) & (df['CHTT'] == 1) & (df['CHUOI'] != 1) & (df['CHUQ'] != 1) & (df['TELESALE'] != 1) & (df['DB'] != 1),
    (df['BHTT'] != 1) & (df['CHTT'] != 1) & (df['CHUOI'] == 1) & (df['CHUQ'] != 1) & (df['TELESALE'] != 1) & (df['DB'] != 1),
    (df['BHTT'] != 1) & (df['CHTT'] != 1) & (df['CHUOI'] != 1) & (df['CHUQ'] == 1) & (df['TELESALE'] != 1) & (df['DB'] != 1),
    (df['BHTT'] != 1) & (df['CHTT'] != 1) & (df['CHUOI'] != 1) & (df['CHUQ'] != 1) & (df['TELESALE'] == 1) & (df['DB'] != 1),
    (df['BHTT'] != 1) & (df['CHTT'] != 1) & (df['CHUOI'] != 1) & (df['CHUQ'] != 1) & (df['TELESALE'] != 1) & (df['DB'] == 1)
    ] # one condition per purchase channel (exactly one flag set)


values = ['1', '2', '3', '4', '5', '6'] # channel code assigned for each matching condition


df['Purchased'] = np.select(conditions, values, default = '0') # new column: first matching condition wins; '0' when no single channel flag is set
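
# A more compact sketch of a similar mapping: one condition per channel flag, generated
# from a dict, with np.select falling back to '0'. Note that, unlike the explicit
# conditions above, a row with more than one flag simply takes the first matching code.
channel_codes = {'BHTT': '1', 'CHTT': '2', 'CHUOI': '3', 'CHUQ': '4', 'TELESALE': '5', 'DB': '6'}
conds = [df[col] == 1 for col in channel_codes]
codes = list(channel_codes.values())
df['Purchased_v2'] = np.select(conds, codes, default = '0')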


#method2


def func_case_when (norm_df):

    if norm_df['BHTT'] == 1 :

        return 1

    elif norm_df['CHTT'] == 1 :

        return 2

    elif norm_df['CHUOI'] == 1 :

        return 3

    elif norm_df['CHUQ'] == 1 :

        return 4

    elif norm_df['TELESALE'] == 1 :

        return 5

    elif norm_df['DB'] == 1 :

        return 6

    else :

        return 0


def fun_purchased (norm_df):

    if norm_df['bhtt'] == 1:

        return 1

    elif norm_df['telesale'] == 1:

        return 2

    else:

        return 0

    

norm_df['purchased_channel'] = norm_df.apply(fun_purchased, axis = 1)


norm_df.head(6)


norm_df.purchased_channel.value_counts()


#two ways to drop a column (df_accident is a separate frame loaded elsewhere in this workflow)

df_accident = df_accident.drop(columns = ['Unnamed'])

df_accident = df_accident.drop(df_accident.columns[0], axis = 1)



#check column data types

a = df.dtypes


#unique

df['thiet_bi_200901'].unique()

df['muagoi'].unique()


#convert to category

norm_df.purchased_channel = norm_df.purchased_channel.astype('category')


norm_df.dtypes


norm_df.describe()

#


norm_df.head(n=2)


#convert the categorical data to dummy variables

df_processed = pd.get_dummies(df, columns= ['thiet_bi_200901'])

df = df_processed.drop(columns = ['Purchase'])

df = df.drop(columns = ['CH'])


df_processed = df_processed.drop(columns= ['CHUOI'])

df_processed = df_processed.drop(columns= ['CTV'])

df_processed = df_processed.drop(columns= ['DB'])

df_processed = df_processed.drop(columns= ['TELESALE'])


df_processed.dtypes

df['Purchased'] = df.Purchased.astype('category')


norm_df['muagoi'] = df.muagoi.astype('category')


#missing data

norm_df.head( n = 10)

df.hatang_1G_200901

norm_df_count_missing = norm_df.isnull().sum()


#

norm_df = pd.read_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/Data_model/DL_train_full.csv")


norm_df.head(5)

#outcome is a one-item list; 'not in' filters a list of column names out of the predictors

outcome = ['purchased_channel']


predictors = [c for c in norm_df.columns

              if c not in ['kenh_mua', 'bhtt', 'telesale', 'muagoi', 'isdn', 'purchased_channel']]





#partition data

x = norm_df[predictors]

y = norm_df[outcome]


train_x, valid_x, train_y, valid_y = train_test_split(x, 

                                                      y, 

                                                      test_size = 0.4, 

                                                      random_state = 1)


train_y.value_counts()
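
# Optional sketch: a stratified split keeps the imbalanced class proportions the same
# in both partitions. Separate names (_s) are used so the split above is not replaced.
train_x_s, valid_x_s, train_y_s, valid_y_s = train_test_split(
    x, y, test_size = 0.4, random_state = 1, stratify = y)
train_y_s.value_counts()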



######OVERSAMPLING#


norm_df.groupby('kenh_mua').isdn.nunique()

norm_df.groupby('muagoi').isdn.nunique()


train_x = train_x.astype('int')

train_y = train_y.astype('int')


d = train_x.dtypes


train_y.value_counts()


oversample = SMOTE(sampling_strategy = {1 : 500000, 2: 350000})


train_x_resample, train_y_resample = oversample.fit_resample(train_x, 

                                                             train_y)


train_y_resample.value_counts()


d = train_x_resample.dtypes
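
# Optional sketch: wrapping SMOTE and the classifier in imbalanced-learn's Pipeline
# resamples only at fit time, so validation data is never oversampled. Step names are illustrative.
from imblearn.pipeline import Pipeline
smote_mlp = Pipeline([
    ('smote', SMOTE(sampling_strategy = {1: 500000, 2: 350000})),
    ('mlp', MLPClassifier(hidden_layer_sizes = (100, 30), activation = 'logistic',
                          solver = 'lbfgs', random_state = 1))
])
# smote_mlp.fit(train_x, train_y.values.ravel())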


###### train neural network with two hidden layers (100 and 30 nodes) ######

##the 'tanh' activation is the hyperbolic tangent; with it, predictors are usually best rescaled to a [-1, 1] range

##deep learning mostly uses the ReLU (rectified linear unit) activation or variants of it: this function is

#identical to the linear function, but set to zero for s < 0
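
# A tiny NumPy illustration (not used by the model) of the two activations mentioned above.
def relu(s):
    return np.maximum(0, s)   # identity for s >= 0, zero for s < 0

def tanh_act(s):
    return np.tanh(s)         # squashes inputs into (-1, 1)

relu(np.array([-2.0, 0.0, 3.0]))   # array([0., 0., 3.])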


clf = MLPClassifier(hidden_layer_sizes= (100,30),

                    activation= 'logistic',

                    solver='lbfgs',

                    random_state = 1)


clf.fit(train_x_resample, train_y_resample)


prediction = clf.predict(train_x)


# training performance (predict returns class labels directly, so no one-hot decoding is needed)

classificationSummary(train_y, clf.predict(train_x))

# validation performance

classificationSummary(valid_y, clf.predict(valid_x))
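
# The same checks with plain scikit-learn, in case dmba's classificationSummary is unavailable (sketch).
from sklearn.metrics import accuracy_score, confusion_matrix
print('validation accuracy:', accuracy_score(valid_y, clf.predict(valid_x)))
print(confusion_matrix(valid_y, clf.predict(valid_x)))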


prediction = clf.predict_proba(valid_x)


prediction = pd.DataFrame(prediction)


prediction.value_counts()


valid_y.value_counts()


def predict_funs (row):

    # label a record 0 (no purchase) only when the predicted class-0 probability is at least 0.98

    if row.loc[0] >= 0.98 :

        return 0

    else:

        return 1


prediction['purchased_pre'] = prediction.apply(predict_funs, axis = 1) 


prediction.purchased_pre.value_counts()
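
# A vectorized sketch of the same 0.98 cut-off, without the row-wise apply.
prediction['purchased_pre_v2'] = np.where(prediction[0] >= 0.98, 0, 1)
prediction.purchased_pre_v2.value_counts()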

    

print(pd.concat([

    valid_x.reset_index(drop = True),   # reset the shuffled index so rows line up with the probabilities

    pd.DataFrame(clf.predict_proba(valid_x))

], axis=1))


prediction = (pd.concat([

    train_x.reset_index(drop = True),   # reset the shuffled index so rows line up with the probabilities

    pd.DataFrame(clf.predict_proba(train_x))

], axis=1))


prediction = clf.predict_proba(train_x)


np.unique(prediction, return_counts= True)


np.unique(df_accident['TELESALE'], return_counts= True)


prediction = pd.DataFrame(prediction)


prediction.head(n=6)


names = [c for c in df.columns]


# Save to file

pickle.dump(clf, open('G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/Data_model/python_model.sav', 'wb'))


# Load from file

clf = pickle.load(open('G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/Data_model/python_model.sav', 'rb'))
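
# A hedged alternative to pickle: joblib handles the large NumPy arrays inside
# scikit-learn estimators efficiently; the .joblib file name here is only illustrative.
from joblib import dump, load
dump(clf, 'G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/Data_model/python_model.joblib')
clf = load('G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/Data_model/python_model.joblib')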


########################################

############Valid dataset ##############

########################################


valid_df = pd.read_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/DL_test.csv")


df_columns = [c for c in valid_df.columns]


df_columns = pd.DataFrame(df_columns)


df_columns.to_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/Data_model/column_name.csv", index = False, header = True)


valid_df = valid_df.rename(columns = {

"isdn_201001":"isdn_200901",

"tuoi_khach_hang_200901":"tuoi_khach_hang_200801",

"thuc_4g_200901":"thuc_4g_200801",

"is_3k3d_200901":"is_3k3d_200801",

"is_199_197_200901":"is_199_197_200801",

"so_ngay_su_dung_200901":"so_ngay_su_dung_200801",

"tong_cuoc_goc_data_4_huong_20090":"tong_cuoc_goc_data_4_huong_20080",

"so_lan_nap_the_200901":"so_lan_nap_the_200801",

"tong_tien_nap_the_200901":"tong_tien_nap_the_200801",

"so_du_tkg_200901":"so_du_tkg_200801",

"thuc_4g_200801":"thuc_4g_200701",

"is_3k3d_200801":"is_3k3d_200701",

"is_199_197_200801":"is_199_197_200701",

"so_ngay_su_dung_200801":"so_ngay_su_dung_200701",

"tong_cuoc_goc_data_4_huong_20080":"tong_cuoc_goc_data_4_huong_20070",

"so_lan_nap_the_200801":"so_lan_nap_the_200701",

"tong_tien_nap_the_200801":"tong_tien_nap_the_200701",

"so_du_tkg_200801":"so_du_tkg_200701",

"thuc_4g_201001":"thuc_4g_200901",

"is_3k3d_201001":"is_3k3d_200901",

"is_199_197_201001":"is_199_197_200901",

"so_ngay_su_dung_201001":"so_ngay_su_dung_200901",

"tong_cuoc_goc_data_4_huong_20100":"tong_cuoc_goc_data_4_huong_20090",

"so_lan_nap_the_201001":"so_lan_nap_the_200901",

"tong_tien_nap_the_201001":"tong_tien_nap_the_200901",

"so_du_tkg_201001":"so_du_tkg_200901",

"muagoi_200701":"muagoi_200701",

"muagoi_200601":"muagoi_200601",

"muagoi_200801":"muagoi_200701",

"tilethoainoi_200901":"tilethoainoi_200801",

"tilethoainoi_200801":"tilethoainoi_200701",

"tilethoainoi_201001":"tilethoainoi_200901",

"thoai_mp200901":"thoai_mp200801",

"thoai_mp200801":"thoai_mp200701",

"thoai_mp201001":"thoai_mp200901",

"thietbi_3g_200901":"thietbi_3g_200801",

"thietbi_2g_200901":"thietbi_2g_200801",

"thietbi_4g_200901":"thietbi_4g_200801",

"thietbi_unknown_200901":"thietbi_unknown_200801",

"thietbi_3g_200801":"thietbi_3g_200701",

"thietbi_2g_200801":"thietbi_2g_200701",

"thietbi_4g_200801":"thietbi_4g_200701",

"thietbi_unknown_200801":"thietbi_unknown_200701",

"thietbi_3g_201001":"thietbi_3g_200901",

"thietbi_2g_201001":"thietbi_2g_200901",

"thietbi_4g_201001":"thietbi_4g_200901",

"thietbi_unknown_201001":"thietbi_unknown_200901",

"hatang_2g_200901":"hatang_2g_200801",

"hatang_3g_200901":"hatang_3g_200801",

"hatang_4g_200901":"hatang_4g_200801",

"hatang_2g_200801":"hatang_2g_200701",

"hatang_3g_200801":"hatang_3g_200701",

"hatang_4g_200801":"hatang_4g_200701",

"hatang_2g_201001":"hatang_2g_200901",

"hatang_3g_201001":"hatang_3g_200901",

"hatang_4g_201001":"hatang_4g_200901",

"loaimay_fp_200901":"loaimay_fp_200801",

"loaimay_sp_200901":"loaimay_sp_200801",

"loaimay_e_200901":"loaimay_e_200801",

"loaimay_u_200901":"loaimay_u_200801",

"loaimay_fp_200801":"loaimay_fp_200701",

"loaimay_sp_200801":"loaimay_sp_200701",

"loaimay_e_200801":"loaimay_e_200701",

"loaimay_u_200801":"loaimay_u_200701",

"loaimay_fp_201001":"loaimay_fp_200901",

"loaimay_sp_201001":"loaimay_sp_200901",

"loaimay_e_201001":"loaimay_e_200901",

"loaimay_u_201001":"loaimay_u_200901",

"home_tinh":"home_tinh",

"Bac_Bo":"Bac_Bo",

"bac_tb":"bac_tb",

"db":"db",

"dbscl":"dbscl",

"dbsh":"dbsh",

"dnbo":"dnbo",

"nam_tb":"nam_tb",

"tay_bac":"tay_bac",

"tay_nguyen":"tay_nguyen",

"thichthoai_200901":"thichthoai_200801",

"thichthoai_200801":"thichthoai_200701",

"thichthoai_201001":"thichthoai_200901",

"thichdata_200901":"thichdata_200801",

"thichdata_200801":"thichdata_200701",

"thichdata_201001":"thichdata_200901",

"tdthap_200901":"tdthap_200801",

"tdtb_200901":"tdtb_200801",

"tdcao_200901":"tdcao_200801",

"tdthap_200801":"tdthap_200701",

"tdtb_200801":"tdtb_200701",

"tdcao_200801":"tdcao_200701",

"tdthap_201001":"tdthap_200901",

"tdtb_201001":"tdtb_200901",

"tdcao_201001":"tdcao_200901",

"llthoaithap_200901":"llthoaithap_200801",

"llthoaitb_200901":"llthoaitb_200801",

"llthoaicao_200901":"llthoaicao_200801",

"llthoaithap_200801":"llthoaithap_200701",

"llthoaitb_200801":"llthoaitb_200701",

"llthoaicao_200801":"llthoaicao_200701",

"llthoaithap_201001":"llthoaithap_200901",

"llthoaitb_201001":"llthoaitb_200901",

"llthoaicao_201001":"llthoaicao_200901",

"tdthoaithap_200901":"tdthoaithap_200801",

"tdthoaitb_200901":"tdthoaitb_200801",

"tdthoaicao_200901":"tdthoaicao_200801",

"tdthoaithap_200801":"tdthoaithap_200701",

"tdthoaitb_200801":"tdthoaitb_200701",

"tdthoaicao_200801":"tdthoaicao_200701",

"tdthoaithap_201001":"tdthoaithap_200901",

"tdthoaitb_201001":"tdthoaitb_200901",

"tdthoaicao_201001":"tdthoaicao_200901",

"tb50mb_200901":"tb50mb_200801",

"tb50mb_200801":"tb50mb_200701",

"tb50mb_201001":"tb50mb_200901"

    })


valid_df.head(3)


a = valid_df.isnull().sum()


valid_df = valid_df.fillna(0)


valid_df.to_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/DL_test_v1.csv", index = False, header = True)


valid_df = pd.read_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/DL_test_v1.csv")


valid_df = valid_df.drop(columns = 'home_tinh')


valid_df['tdthoaithap_200901'].unique()


#normalize the validation data with the same min-max scaling as the training data


a = valid_df.dtypes


valid_df = valid_df.astype('int')


valid_df.head()


normalized_df = valid_df.drop(columns = 'isdn_200901')


normalized_df=(valid_df.drop(columns = 'isdn_200901') - valid_df.drop(columns = 'isdn_200901').min())/(valid_df.drop(columns = 'isdn_200901').max() - valid_df.drop(columns = 'isdn_200901').min())


a = normalized_df.isnull().sum()


normalized_df = normalized_df.fillna(0)


normalized_df.head(3)


print(valid_df['isdn_200901'])


prediction = (pd.concat([

    valid_df['isdn_200901'],

    pd.DataFrame(clf.predict_proba(normalized_df))

], axis=1))


prediction = prediction.rename(columns = {

    "isdn_200901" : "isdn",

    0 : "Not_purchase",

    1 : "bhtt",

    2 : "telesale"

    })   # predict_proba columns are integer labels, so the keys must be ints, not strings


prediction.head( n=10 )

prediction.tail( n=10 )


prediction.to_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/nn_pre_p2.csv", index = False, header = True)


nn_pre_p1 = prediction[['isdn', 0, 1, 2, 3, 4, 5, 6]]


nn_pre_p1.head( n=6 )


prediction_p1 = pd.read_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/nn_pre_p1.csv")


nn_pre_p1 = pd.read_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/nn_pre_p1.csv")

nn_pre_p2 = prediction


nn_pre_p2 = nn_pre_p2.rename (columns = {'isdn_200901' : 'isdn'})


nn_pre_p1 = pd.DataFrame(nn_pre_p1)

nn_pre_p2 = pd.DataFrame(nn_pre_p2)


nn_pre = pd.concat([nn_pre_p1, nn_pre_p2])