# -*- coding: utf-8 -*-
"""
Created on Tue Nov 3 10:01:10 2020
@author: phulh5
"""
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from imblearn.over_sampling import SMOTE
from dmba import classificationSummary
#show up to 100 rows and 100 columns when displaying DataFrames
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
#import dataset
df1 = pd.read_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/DL_komua_random.csv")
df2 = pd.read_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/DL_muagoi.csv")
df = pd.concat([df1, df2], ignore_index=True)
#head data
df.head(n=10)
df.tail(n=6)
df = df.drop(columns = 'randnum')
df = df.drop(columns = 'home_tinh')
#summary
df.describe()
#check all columns
df.columns
########Data dimension reduction - Principal Components Analysis##############
#fit PCA on the numeric predictors (columns 1-118), using complete cases only
pcs = PCA()
pcs.fit(df.iloc[:, 1:119].dropna(axis=0))
pcsSummary_df = pd.DataFrame({
    'Standard deviation': np.sqrt(pcs.explained_variance_),
    'Proportion of variance': pcs.explained_variance_ratio_,
    'Cumulative proportion': np.cumsum(pcs.explained_variance_ratio_)
})
pcsSummary_df = pcsSummary_df.transpose()
pcsSummary_df.columns = ['PC' + str(i) for i in range(1, len(pcsSummary_df.columns) + 1)]
pcsSummary_df.round(4)
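#A small sketch (not part of the original pipeline): keep the components that explain
#~90% of the variance and project the complete-case predictors onto them; the 0.90
#threshold and the names X_pca_input / scores are illustrative assumptions.
X_pca_input = df.iloc[:, 1:119].dropna(axis=0)
n_comp = int(np.argmax(np.cumsum(pcs.explained_variance_ratio_) >= 0.90) + 1)
scores = pd.DataFrame(PCA(n_components=n_comp).fit_transform(X_pca_input),
                      columns=['PC' + str(i) for i in range(1, n_comp + 1)])
scores.head()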
##Normalizing (Standardizing) and Rescaling Data######################
missing_cnt = df.isnull().sum()
df = df.fillna(0)
#manual min-max rescaling of every predictor except the identifier 'isdn'
features = df.drop(columns='isdn')
norm_df = (features - features.min()) / (features.max() - features.min())
norm_df.head(6)
#or use scikit-learn:
scaler = MinMaxScaler()
norm_df_v1 = pd.DataFrame(scaler.fit_transform(df),
                          index=df.index,
                          columns=df.columns)
compare = pd.concat([norm_df['tuoi_khach_hang_200901'],
                     norm_df_v1['tuoi_khach_hang_200901']],
                    axis=1,
                    keys=['norm_df', 'norm_df_v1'])
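#Alternative rescaling (sketch, not used downstream): z-score standardization with
#StandardScaler centers each predictor at 0 with unit variance; the name std_df is illustrative.
zscaler = StandardScaler()
std_df = pd.DataFrame(zscaler.fit_transform(df.drop(columns='isdn')),
                      index=df.index,
                      columns=df.drop(columns='isdn').columns)
std_df.head(3)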
##########################################################case_when_if#####################################
norm_df.head()
conditions = [
    (df['BHTT'] == 1) & (df['CHTT'] != 1) & (df['CHUOI'] != 1) & (df['CHUQ'] != 1) & (df['TELESALE'] != 1) & (df['DB'] != 1),
    (df['BHTT'] != 1) & (df['CHTT'] == 1) & (df['CHUOI'] != 1) & (df['CHUQ'] != 1) & (df['TELESALE'] != 1) & (df['DB'] != 1),
    (df['BHTT'] != 1) & (df['CHTT'] != 1) & (df['CHUOI'] == 1) & (df['CHUQ'] != 1) & (df['TELESALE'] != 1) & (df['DB'] != 1),
    (df['BHTT'] != 1) & (df['CHTT'] != 1) & (df['CHUOI'] != 1) & (df['CHUQ'] == 1) & (df['TELESALE'] != 1) & (df['DB'] != 1),
    (df['BHTT'] != 1) & (df['CHTT'] != 1) & (df['CHUOI'] != 1) & (df['CHUQ'] != 1) & (df['TELESALE'] == 1) & (df['DB'] != 1),
    (df['BHTT'] != 1) & (df['CHTT'] != 1) & (df['CHUOI'] != 1) & (df['CHUQ'] != 1) & (df['TELESALE'] != 1) & (df['DB'] == 1)
] # a list of mutually exclusive conditions: exactly one channel flag is set
values = ['1', '2', '3', '4', '5', '6'] # the value assigned to each condition, in the same order
# np.select takes the value of the first matching condition; rows matching none fall back to '0'
df['Purchased'] = np.select(conditions, values, default='0')
#method 2: row-wise functions with apply
def func_case_when(row):
    #map the mutually exclusive channel flags to a single code (kept for reference)
    if row['BHTT'] == 1:
        return 1
    elif row['CHTT'] == 1:
        return 2
    elif row['CHUOI'] == 1:
        return 3
    elif row['CHUQ'] == 1:
        return 4
    elif row['TELESALE'] == 1:
        return 5
    elif row['DB'] == 1:
        return 6
    else:
        return 0

def fun_purchased(row):
    #collapse the purchase channel into 0 = no purchase, 1 = bhtt, 2 = telesale
    if row['bhtt'] == 1:
        return 1
    elif row['telesale'] == 1:
        return 2
    else:
        return 0

norm_df['purchased_channel'] = norm_df.apply(fun_purchased, axis=1)
norm_df.head(6)
norm_df.purchased_channel.value_counts()
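#Sanity check (sketch): the row-wise function should agree with the np.select result above
#whenever at most one channel flag is set per row; np.select produced strings, so cast first.
check = df.apply(func_case_when, axis=1)
print((df['Purchased'].astype(int) == check).all())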
#to drop a column (df_accident is prepared in a separate script; kept here only as examples)
df_accident = df_accident.drop(columns=['Unnamed'])
df_accident = df_accident.drop(df_accident.columns[0], axis=1)
#dtypes overview (the pandas equivalent of R's str())
a = df.dtypes
#unique
df['thiet_bi_200901'].unique()
df['muagoi'].unique()
#convert to category
norm_df.purchased_channel = norm_df.purchased_channel.astype('category')
norm_df.dtypes
norm_df.describe()
#
norm_df.head(n=2)
#convert the categorical data to dummy variables
df_processed = pd.get_dummies(df, columns=['thiet_bi_200901'])
#drop the channel flag columns that were already folded into 'Purchased'
df_processed = df_processed.drop(columns=['Purchased', 'CH', 'CHUOI', 'CTV', 'DB', 'TELESALE'])
df_processed.dtypes
df['Purchased'] = df.Purchased.astype('category')
norm_df['muagoi'] = df.muagoi.astype('category')
#missing data
norm_df.head( n = 10)
df.hatang_1G_200901
norm_df_count_missing = norm_df.isnull().sum()
#
norm_df = pd.read_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/Data_model/DL_train_full.csv")
norm_df.head(5)
#use '==' to compare a Series against a scalar/str; use 'not in' to test membership in a list
outcome = ['purchased_channel']
predictors = [c for c in norm_df.columns
              if c not in ['kenh_mua', 'bhtt', 'telesale', 'muagoi', 'isdn', 'purchased_channel']]
#partition data
x = norm_df[predictors]
y = norm_df[outcome]
train_x, valid_x, train_y, valid_y = train_test_split(x,
y,
test_size = 0.4,
random_state = 1)
train_y.value_counts()
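#Optional variant (sketch): a stratified split keeps the purchased_channel class mix identical
#in the train and validation partitions; the *_strat names are illustrative and unused below.
train_x_strat, valid_x_strat, train_y_strat, valid_y_strat = train_test_split(
    x, y, test_size=0.4, random_state=1, stratify=y['purchased_channel'])
train_y_strat.value_counts()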
######OVERSAMPLING#
norm_df.groupby('kenh_mua').isdn.nunique()
norm_df.groupby('muagoi').isdn.nunique()
train_x = train_x.astype('int')
train_y = train_y.astype('int')
d = train_x.dtypes
train_y.value_counts()
#oversample the minority classes to roughly 500,000 rows for class 1 (bhtt) and 350,000 for class 2 (telesale)
oversample = SMOTE(sampling_strategy={1: 500000, 2: 350000})
train_x_resample, train_y_resample = oversample.fit_resample(train_x, train_y)
train_y_resample.value_counts()
d = train_x_resample.dtypes
###### train a neural network with two hidden layers (100 and 30 nodes) ######
##The 'tanh' activation is the hyperbolic tangent; with it, predictors are usually rescaled to a [-1, 1] range.
##Deep learning mostly uses the ReLU (rectified linear unit) activation or variants of it;
##ReLU is identical to the linear function for s >= 0 and equal to zero for s < 0.
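#Sketch (assumption, not the fitted model below): with 'tanh', rescale the predictors to
#[-1, 1] first; with 'relu' the [0, 1] min-max scaling used earlier is fine.
#The names scaler_pm1, train_x_pm1 and clf_tanh are illustrative only.
scaler_pm1 = MinMaxScaler(feature_range=(-1, 1))
train_x_pm1 = pd.DataFrame(scaler_pm1.fit_transform(train_x), columns=train_x.columns)
clf_tanh = MLPClassifier(hidden_layer_sizes=(100, 30), activation='tanh',
                         solver='adam', random_state=1)
#clf_tanh.fit(train_x_pm1, train_y)  # uncomment to actually train (slow on large data)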
clf = MLPClassifier(hidden_layer_sizes= (100,30),
activation= 'logistic',
solver='lbfgs',
random_state = 1)
clf.fit(train_x_resample, train_y_resample)
prediction = clf.predict(train_x)
# training performance
classificationSummary(train_y, clf.predict(train_x))
# validation performance
classificationSummary(valid_y, clf.predict(valid_x))
prediction = clf.predict_proba(valid_x)
prediction = pd.DataFrame(prediction)
prediction.value_counts()
valid_y.value_counts()
def predict_funs(row):
    #flag a purchase (1) unless the no-purchase probability (column 0) is at least 0.98
    if row.loc[0] >= 0.98:
        return 0
    else:
        return 1

prediction['purchased_pre'] = prediction.apply(predict_funs, axis=1)
prediction.purchased_pre.value_counts()
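#Sketch: evaluate the 0.98 cutoff against the actual outcome collapsed to a binary
#purchase / no-purchase label; rows align positionally because both come from valid_x.
actual_binary = (valid_y['purchased_channel'] != 0).astype(int)
classificationSummary(actual_binary, prediction['purchased_pre'])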
# reset_index so the predictor rows line up with the 0..n-1 index of the probability frame
print(pd.concat([
    valid_x.reset_index(drop=True),
    pd.DataFrame(clf.predict_proba(valid_x))
], axis=1))
prediction = pd.concat([
    train_x.reset_index(drop=True),
    pd.DataFrame(clf.predict_proba(train_x))
], axis=1)
prediction = clf.predict_proba(train_x)
np.unique(prediction, return_counts= True)
np.unique(df_accident['TELESALE'], return_counts= True)
prediction = pd.DataFrame(prediction)
prediction.head(n=6)
names = [c for c in df.columns]
# Save to file
pickle.dump(clf, open('G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/Data_model/python_model.sav', 'wb'))
# Load from file
clf = pickle.load(open('G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/Data_model/python_model.sav', 'rb'))
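#Sanity check (sketch): the reloaded model should reproduce the validation performance
#reported above, assuming valid_x and valid_y from the earlier partition are still in memory.
classificationSummary(valid_y, clf.predict(valid_x))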
########################################
############Valid dataset ##############
########################################
valid_df = pd.read_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/DL_test.csv")
df_columns = [c for c in valid_df.columns]
df_columns = pd.DataFrame(df_columns)
df_columns.to_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/Data_model/column_name.csv", index = False, header = True)
valid_df = valid_df.rename(columns = {
"isdn_201001":"isdn_200901",
"tuoi_khach_hang_200901":"tuoi_khach_hang_200801",
"thuc_4g_200901":"thuc_4g_200801",
"is_3k3d_200901":"is_3k3d_200801",
"is_199_197_200901":"is_199_197_200801",
"so_ngay_su_dung_200901":"so_ngay_su_dung_200801",
"tong_cuoc_goc_data_4_huong_20090":"tong_cuoc_goc_data_4_huong_20080",
"so_lan_nap_the_200901":"so_lan_nap_the_200801",
"tong_tien_nap_the_200901":"tong_tien_nap_the_200801",
"so_du_tkg_200901":"so_du_tkg_200801",
"thuc_4g_200801":"thuc_4g_200701",
"is_3k3d_200801":"is_3k3d_200701",
"is_199_197_200801":"is_199_197_200701",
"so_ngay_su_dung_200801":"so_ngay_su_dung_200701",
"tong_cuoc_goc_data_4_huong_20080":"tong_cuoc_goc_data_4_huong_20070",
"so_lan_nap_the_200801":"so_lan_nap_the_200701",
"tong_tien_nap_the_200801":"tong_tien_nap_the_200701",
"so_du_tkg_200801":"so_du_tkg_200701",
"thuc_4g_201001":"thuc_4g_200901",
"is_3k3d_201001":"is_3k3d_200901",
"is_199_197_201001":"is_199_197_200901",
"so_ngay_su_dung_201001":"so_ngay_su_dung_200901",
"tong_cuoc_goc_data_4_huong_20100":"tong_cuoc_goc_data_4_huong_20090",
"so_lan_nap_the_201001":"so_lan_nap_the_200901",
"tong_tien_nap_the_201001":"tong_tien_nap_the_200901",
"so_du_tkg_201001":"so_du_tkg_200901",
"muagoi_200701":"muagoi_200701",
"muagoi_200601":"muagoi_200601",
"muagoi_200801":"muagoi_200701",
"tilethoainoi_200901":"tilethoainoi_200801",
"tilethoainoi_200801":"tilethoainoi_200701",
"tilethoainoi_201001":"tilethoainoi_200901",
"thoai_mp200901":"thoai_mp200801",
"thoai_mp200801":"thoai_mp200701",
"thoai_mp201001":"thoai_mp200901",
"thietbi_3g_200901":"thietbi_3g_200801",
"thietbi_2g_200901":"thietbi_2g_200801",
"thietbi_4g_200901":"thietbi_4g_200801",
"thietbi_unknown_200901":"thietbi_unknown_200801",
"thietbi_3g_200801":"thietbi_3g_200701",
"thietbi_2g_200801":"thietbi_2g_200701",
"thietbi_4g_200801":"thietbi_4g_200701",
"thietbi_unknown_200801":"thietbi_unknown_200701",
"thietbi_3g_201001":"thietbi_3g_200901",
"thietbi_2g_201001":"thietbi_2g_200901",
"thietbi_4g_201001":"thietbi_4g_200901",
"thietbi_unknown_201001":"thietbi_unknown_200901",
"hatang_2g_200901":"hatang_2g_200801",
"hatang_3g_200901":"hatang_3g_200801",
"hatang_4g_200901":"hatang_4g_200801",
"hatang_2g_200801":"hatang_2g_200701",
"hatang_3g_200801":"hatang_3g_200701",
"hatang_4g_200801":"hatang_4g_200701",
"hatang_2g_201001":"hatang_2g_200901",
"hatang_3g_201001":"hatang_3g_200901",
"hatang_4g_201001":"hatang_4g_200901",
"loaimay_fp_200901":"loaimay_fp_200801",
"loaimay_sp_200901":"loaimay_sp_200801",
"loaimay_e_200901":"loaimay_e_200801",
"loaimay_u_200901":"loaimay_u_200801",
"loaimay_fp_200801":"loaimay_fp_200701",
"loaimay_sp_200801":"loaimay_sp_200701",
"loaimay_e_200801":"loaimay_e_200701",
"loaimay_u_200801":"loaimay_u_200701",
"loaimay_fp_201001":"loaimay_fp_200901",
"loaimay_sp_201001":"loaimay_sp_200901",
"loaimay_e_201001":"loaimay_e_200901",
"loaimay_u_201001":"loaimay_u_200901",
"home_tinh":"home_tinh",
"Bac_Bo":"Bac_Bo",
"bac_tb":"bac_tb",
"db":"db",
"dbscl":"dbscl",
"dbsh":"dbsh",
"dnbo":"dnbo",
"nam_tb":"nam_tb",
"tay_bac":"tay_bac",
"tay_nguyen":"tay_nguyen",
"thichthoai_200901":"thichthoai_200801",
"thichthoai_200801":"thichthoai_200701",
"thichthoai_201001":"thichthoai_200901",
"thichdata_200901":"thichdata_200801",
"thichdata_200801":"thichdata_200701",
"thichdata_201001":"thichdata_200901",
"tdthap_200901":"tdthap_200801",
"tdtb_200901":"tdtb_200801",
"tdcao_200901":"tdcao_200801",
"tdthap_200801":"tdthap_200701",
"tdtb_200801":"tdtb_200701",
"tdcao_200801":"tdcao_200701",
"tdthap_201001":"tdthap_200901",
"tdtb_201001":"tdtb_200901",
"tdcao_201001":"tdcao_200901",
"llthoaithap_200901":"llthoaithap_200801",
"llthoaitb_200901":"llthoaitb_200801",
"llthoaicao_200901":"llthoaicao_200801",
"llthoaithap_200801":"llthoaithap_200701",
"llthoaitb_200801":"llthoaitb_200701",
"llthoaicao_200801":"llthoaicao_200701",
"llthoaithap_201001":"llthoaithap_200901",
"llthoaitb_201001":"llthoaitb_200901",
"llthoaicao_201001":"llthoaicao_200901",
"tdthoaithap_200901":"tdthoaithap_200801",
"tdthoaitb_200901":"tdthoaitb_200801",
"tdthoaicao_200901":"tdthoaicao_200801",
"tdthoaithap_200801":"tdthoaithap_200701",
"tdthoaitb_200801":"tdthoaitb_200701",
"tdthoaicao_200801":"tdthoaicao_200701",
"tdthoaithap_201001":"tdthoaithap_200901",
"tdthoaitb_201001":"tdthoaitb_200901",
"tdthoaicao_201001":"tdthoaicao_200901",
"tb50mb_200901":"tb50mb_200801",
"tb50mb_200801":"tb50mb_200701",
"tb50mb_201001":"tb50mb_200901"
})
valid_df.head(3)
a = valid_df.isnull().sum()
valid_df = valid_df.fillna(0)
valid_df.to_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/DL_test_v1.csv", index = False, header = True)
valid_df = pd.read_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/DL_test_v1.csv")
valid_df = valid_df.drop(columns = 'home_tinh')
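#Sanity check (sketch): after the renaming above, the scoring columns should line up with the
#training predictors; 'isdn_200901' is expected to remain as an extra identifier column here.
print(set(predictors) - set(valid_df.columns))
print(set(valid_df.columns) - set(predictors))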
valid_df['tdthoaithap_200901'].unique()
#normalize
a = valid_df.dtypes
valid_df = valid_df.astype('int')
valid_df.head()
valid_features = valid_df.drop(columns='isdn_200901')
normalized_df = (valid_features - valid_features.min()) / (valid_features.max() - valid_features.min())
a = normalized_df.isnull().sum()
normalized_df = normalized_df.fillna(0)
normalized_df.head(3)
print(valid_df['isdn_200901'])
prediction = pd.concat([
    valid_df['isdn_200901'],
    pd.DataFrame(clf.predict_proba(normalized_df))
], axis=1)
#predict_proba returns integer column labels (0, 1, 2), so rename them with integer keys
prediction = prediction.rename(columns={
    "isdn_200901": "isdn",
    0: "Not_purchase",
    1: "bhtt",
    2: "telesale"
})
prediction.head( n=10 )
prediction.tail( n=10 )
prediction.to_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/nn_pre_p2.csv", index = False, header = True)
nn_pre_p1 = prediction[['isdn', 0, 1, 2, 3, 4, 5, 6]]
nn_pre_p1.head( n=6 )
prediction_p1 = pd.read_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/nn_pre_p1.csv")
nn_pre_p1 = pd.read_csv("G:/Phu/Ad_hoc/Phantich_Kenh/Kenh_T12/nn_pre_p1.csv")
nn_pre_p2 = prediction
nn_pre_p2 = nn_pre_p2.rename (columns = {'isdn_200901' : 'isdn'})
nn_pre_p1 = pd.DataFrame(nn_pre_p1)
nn_pre_p2 = pd.DataFrame(nn_pre_p2)
nn_pre = pd.concat([nn_pre_p1, nn_pre_p2], ignore_index=True)