# -*- coding: utf-8 -*-
"""
Created on Fri Apr 28 19:53:41 2023

@author: zhang lanxin
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, f1_score
import xgboost as xgb
import os

# Feature table (model inputs).
X = pd.read_csv('Features-Exp.csv')

# Label table (model targets).
Y = pd.read_csv('Abnormal-Exp.csv')

# Pairwise correlation between the feature columns.
corr_matrix = X.corr()

# Hide the upper triangle (diagonal included): the matrix is symmetric, so
# only the lower triangle is drawn.
# The builtin `bool` is used instead of `np.bool`: that alias was deprecated
# in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.zeros_like(corr_matrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Figure that receives the heatmap.
f, ax = plt.subplots(figsize=(11, 9))

# Diverging colormap centred on zero correlation.
cmap = sns.diverging_palette(360, 9, as_cmap=True)

# Draw the masked heatmap with square cells; the colour scale is capped at
# 0.3 so weak correlations remain distinguishable.
s1 = sns.heatmap(corr_matrix, mask=mask, cmap=cmap, vmax=.3, center=0,
                 square=True, linewidths=.5, cbar_kws={"shrink": .5})

# Save the figure that owns the heatmap axes.
p1 = s1.get_figure()
p1.savefig('Heatmap-Exp.png', dpi=600)

## PCA
# Standardise every feature before the decomposition.
X_normalize = StandardScaler().fit_transform(X)

from sklearn.decomposition import PCA

# Full decomposition: plot the variance captured by each component to help
# choose n_components.
pca = PCA().fit(X_normalize)
plt.figure(figsize=(15, 10))
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')
# Applied after axis('tight') so this fixed y-range is the one that is kept.
plt.ylim(-0.5, 2)
plt.title('PCA explained variance vs. n componets')

# Two-component projection, shown as a scatter plot.
pca_2 = PCA(n_components=2)
projected = pca_2.fit_transform(X_normalize)
print(X.shape, projected.shape, sep='\n')
plt.figure(figsize=(15, 10))
plt.scatter(*projected.T)  # first component on x, second component on y
plt.xlabel('PCA (Dim=1)')
plt.ylabel('PCA (Dim=2)')
plt.title('Clustering of data when PCA dimension =2')


########
# The classifiers below are trained on the raw (unscaled) features: skipping
# StandardScaler sometimes gives a better classification result.
from sklearn.preprocessing import Normalizer, MinMaxScaler

# Hold out 29 % of the samples; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.29,
    random_state=36,
)

# If scaling is ever re-enabled, fit the scaler on X_train only and apply
# transform() (not fit_transform()) to X_test:
# standS = StandardScaler().fit(X_train)
# X_train_std = standS.transform(X_train)
# X_test_std = standS.transform(X_test)


import itertools

# Create a confusion matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated image.

    The drawing goes to the current matplotlib figure/axes.

    cm        : 2-D array of counts (rows index the true class).
    classes   : tick labels, one per class, in matrix order.
    normalize : when True, each row is divided by its row sum before
                printing and plotting.
    title     : title of the plot.
    cmap      : matplotlib colormap used for the image.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        banner = "Normalized confusion matrix"
    else:
        banner = 'Confusion matrix, without normalization'
    print(banner)

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=14)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Counts are written as integers, normalised values with two decimals.
    cell_format = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            if value > half_max:
                text_colour = "white"
            else:
                text_colour = "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_colour)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')



def _confusion_tick_labels(y_true, y_pred):
    """Return display names for the classes of a confusion matrix.

    ``confusion_matrix`` orders its rows/columns by the sorted class values
    found in ``y_true`` and ``y_pred``; the names are produced in that same
    order.  Label 0 is named 'Normal' (the module-level code selects normal
    rows with ``Y == 0``); any other label is named 'Anomaly', suffixed with
    the label value when more than one non-zero label is present.
    """
    class_ids = np.unique(np.concatenate([np.ravel(y_true), np.ravel(y_pred)]))
    anomaly_count = int(np.count_nonzero(class_ids != 0))
    names = []
    for class_id in class_ids:
        if class_id == 0:
            names.append('Normal')
        elif anomaly_count == 1:
            names.append('Anomaly')
        else:
            names.append('Anomaly {}'.format(class_id))
    return names


def predict_driving_performance(X, Y, predictType):
    """Train an XGBoost classifier on (X, Y) and report/plot its results.

    The data is split 80/20 into a training part and a hold-out part; half
    of the hold-out part is additionally used as a validation subset.  For
    the validation subset and for the full hold-out part the function prints
    the predictions and the classification report and draws a confusion
    matrix.  Finally it draws a bar chart of the model's feature importances.

    Parameters
    ----------
    X : feature table passed to ``train_test_split`` / ``XGBClassifier.fit``.
    Y : label table aligned with ``X``.
    predictType : str
        Text appended to the title of the feature-importance chart.

    Returns
    -------
    None.  All results are printed or drawn on new matplotlib figures.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, random_state=24)
    # Only the second half of the hold-out split is used.  It is a subset of
    # X_test, so the two reports below are not independent of each other.
    _, X_val, _, y_val = train_test_split(
        X_test, y_test, test_size=0.5, random_state=24)

    seed = 24
    # NOTE(review): for a two-class target the usual objective is
    # 'binary:logistic'; confirm that 'multi:softprob' with num_class=2 still
    # makes predict() return 1-D class labels on the installed xgboost
    # version, and that scale_pos_weight has any effect with this objective.
    xgb1 = xgb.sklearn.XGBClassifier(
        learning_rate=0.1,
        n_estimators=10,  # 100
        max_depth=5,
        min_child_weight=11,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.7,
        objective='multi:softprob',
        num_class=2,
        n_jobs=-1,
        scale_pos_weight=1,
        seed=seed)
    xgb1.fit(X_train, y_train)

    # --- validation subset ---
    y_pred1 = xgb1.predict(X_val)
    print(y_pred1)

    cm = confusion_matrix(y_val, y_pred1)
    print(classification_report(y_val, y_pred1))

    # Tick labels follow the sorted class order used by confusion_matrix.
    # The previous fixed ['Anomaly', 'Normal'] list named row 0 (label 0,
    # the normal class) as 'Anomaly'.
    labels = _confusion_tick_labels(y_val, y_pred1)
    fig = plt.figure(figsize=(11.963, 9.3))
    fig.add_subplot(211)
    plot_confusion_matrix(cm, labels, title="Confusion Matrix---XGBoost", cmap=plt.cm.Greens)
    #plt.savefig('XGboost1-Exp',dpi=600)

    # --- full hold-out part ---
    y_pred2 = xgb1.predict(X_test)
    print(y_pred2)

    cm = confusion_matrix(y_test, y_pred2)
    print(classification_report(y_test, y_pred2))

    labels = _confusion_tick_labels(y_test, y_pred2)
    fig = plt.figure(figsize=(11.963, 9.3))
    fig.add_subplot(211)
    plot_confusion_matrix(cm, labels, title="Confusion Matrix---XGBoost2", cmap=plt.cm.Greens)
    #plt.savefig('XGboost2-Exp',dpi=600)

    # --- feature importances ---
    fig, ax = plt.subplots(figsize=(15, 10))
    feature_importance = xgb1.feature_importances_
    # Size the chart from the model instead of assuming exactly six features.
    ind = np.arange(len(feature_importance))
    default_names = ('carCenterXm', 'carCenterYm', 'speed(m/s)', 'angle', '2DTTC', 'distance')
    if len(feature_importance) == len(default_names):
        tick_names = default_names
    elif hasattr(X, 'columns'):
        tick_names = [str(column) for column in X.columns]
    else:
        tick_names = [str(position) for position in ind]
    # Bars and ticks share the same x positions so each label sits under the
    # centre of its bar.
    ax.bar(ind, feature_importance)
    ax.set_xticks(ind)
    ax.set_xticklabels(tick_names)
    ax.set_ylabel('Importance')
    ax.set_xlabel('Measurements')
    ax.set_title('Relative Importance of various measurements to predict ' + predictType)
    #plt.show()
    #plt.savefig('Important features-Exp',dpi=600)

# Train and evaluate the XGBoost classifier on the complete feature/label
# tables; the last argument only ends up in the importance-chart title.
predict_driving_performance(
    X,
    Y,
    predictType='Abnormal driving behaviour',
)


'''
Plot confusion matrix
'''
import itertools

# Create a confusion matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and render it on the current matplotlib axes.

    This definition rebinds the name of the earlier function of the same
    name in this file; unlike that one it does not call plt.tight_layout().

    cm        : 2-D array of counts (rows index the true class).
    classes   : tick labels, one per class, in matrix order.
    normalize : divide every row by its sum before printing/plotting.
    title     : plot title.
    cmap      : matplotlib colormap for the image.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=14)
    plt.colorbar()
    ticks = np.arange(len(classes))
    plt.xticks(ticks, classes, rotation=45)
    plt.yticks(ticks, classes)

    # Integer format for raw counts, two decimals for normalised values.
    number_format = '.2f' if normalize else 'd'
    # Switch to white text on cells above half of the largest value.
    contrast_limit = cm.max() / 2.
    for r, c in np.ndindex(cm.shape[0], cm.shape[1]):
        entry = cm[r, c]
        plt.text(c, r, format(entry, number_format),
                 horizontalalignment="center",
                 color="white" if entry > contrast_limit else "black")

    #plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Accumulators that the evaluation sections below append to.
t, f1score, cm_record, classification_report_record = [], [], [], []
FPR_record, TPR_record, ACC_record = [], [], []

from sklearn.ensemble import IsolationForest
import time

# Boolean masks over the label table: label 0 marks a normal sample, every
# other label marks an outlier.
normal_index = np.array(Y == 0)
abnormal_index = np.array(Y != 0)
X_normal, X_outliers = X[normal_index], X[abnormal_index]
y_normal, y_outliers = Y[normal_index], Y[abnormal_index]

# 80 % of the normal samples go to the validation side, 20 % to the
# "normal train" side.
(X_normal_train, X_normal_valid,
 y_normal_train, y_normal_valid) = train_test_split(
    X_normal, y_normal, test_size=0.80, random_state=24)

# Validation features: the held-out normal samples plus every outlier.
# NOTE(review): X_val is assembled from the full table independently of the
# earlier X_train/X_test split, so it may share rows with X_train - confirm
# that this overlap is intended.
X_val = pd.concat([X_normal_valid, X_outliers])

# Collapse the labels to binary: 0 stays 0, labels 1 and 2 both become 1.
map_v = {0: 0, 1: 1, 2: 1}

y_val_v = pd.concat([y_normal_valid, y_outliers])
y_val_v1 = y_val_v['Label'].map(map_v)
y_trian_try = y_normal_train['Label'].map(map_v)
# y_train / y_test are rebound from label tables to binary label Series.
y_train = y_train['Label'].map(map_v)
y_test = y_test['Label'].map(map_v)
y_normal_valid_try = y_normal_valid['Label'].map(map_v)
y_all = Y['Label'].map(map_v)





# ---------------------------------------------------------------------------
# RandomForestClassifier: fit on the train split, score on X_val
# ---------------------------------------------------------------------------
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

# Only the fit itself is timed.
t0 = time.time()
rfc.fit(X_train, y_train)
t1 = time.time()

t_rfc = t1 - t0
print('#################################################################')
print("Random Forest {:.3} s".format(t_rfc))
t.append(t_rfc)

y_pred = rfc.predict(X_val)

# Both label vectors are negated so that the sorted class order becomes
# (-1, 0): row/column 0 is then the anomaly class (label 1) and row/column 1
# the normal class (label 0).
cm = confusion_matrix(-y_val_v1, -y_pred)

print(classification_report(y_val_v1, y_pred))
print(f1_score(y_val_v1, y_pred))

# F1 with the labels flipped (1 - label), i.e. with label 0 as the positive
# class.
print("F1:")
print(f1_score(-(y_val_v1 - 1), -(y_pred - 1)))

# Header only: the score computation itself is disabled.
print("roc_auc_score:")
# print(roc_auc_score(-(y_test-1), -(y_pred-1)))

# With the layout above: cm[0, 0] = TP, cm[0, 1] = FN, cm[1, 0] = FP,
# cm[1, 1] = TN (anomaly is the positive class).
FPR = cm[1, 0] / (cm[1, 0] + cm[1, 1])
TPR = cm[0, 0] / (cm[0, 0] + cm[0, 1])
ACC = (cm[0, 0] + cm[1, 1]) / (cm[0, 0] + cm[0, 1] + cm[1, 0] + cm[1, 1])
Precision = cm[0, 0] / (cm[1, 0] + cm[0, 0])
Recall = cm[0, 0] / (cm[0, 1] + cm[0, 0])

FPR_record.append(FPR)
TPR_record.append(TPR)
ACC_record.append(ACC)

print('FPR:', FPR, sep='\n')

print('TPR:', TPR, sep='\n')

print('ACC:', ACC, sep='\n')

# Tick labels in matrix order (anomaly first, see the negation above).
labels = ['Anomaly', 'Normal']

fig = plt.figure()
plot_confusion_matrix(cm, labels, title="Confusion Matrix---Random Forest", cmap=plt.cm.Blues)






# ---------------------------------------------------------------------------
# Random forest: refit on the train split, score on X_test
# ---------------------------------------------------------------------------
# The classifier was created without a random_state, so this refit need not
# reproduce the forest that was fitted before.
t0 = time.time()
rfc.fit(X_train, y_train)
t1 = time.time()

t_rfc = t1 - t0
print('#################################################################')
print("Random Forest {:.3} s".format(t_rfc))
t.append(t_rfc)

y_pred = rfc.predict(X_test)

# Negated labels sort as (-1, 0): index 0 of the matrix is the anomaly
# class (label 1), index 1 the normal class (label 0).
cm = confusion_matrix(-y_test, -y_pred)

print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred))

# F1 computed on flipped labels (1 - label): label 0 is the positive class.
print("F1:")
print(f1_score(-(y_test - 1), -(y_pred - 1)))

# Header only: the score computation itself is disabled.
print("roc_auc_score:")
# print(roc_auc_score(-(y_test-1), -(y_pred-1)))

# Matrix cells: [0, 0] = TP, [0, 1] = FN, [1, 0] = FP, [1, 1] = TN.
FPR = cm[1, 0] / (cm[1, 0] + cm[1, 1])
TPR = cm[0, 0] / (cm[0, 0] + cm[0, 1])
ACC = (cm[0, 0] + cm[1, 1]) / (cm[0, 0] + cm[0, 1] + cm[1, 0] + cm[1, 1])
Precision = cm[0, 0] / (cm[1, 0] + cm[0, 0])
Recall = cm[0, 0] / (cm[0, 1] + cm[0, 0])

FPR_record.append(FPR)
TPR_record.append(TPR)
ACC_record.append(ACC)

print('FPR:', FPR, sep='\n')

print('TPR:', TPR, sep='\n')

print('ACC:', ACC, sep='\n')

# Tick labels in matrix order (anomaly first).
labels = ['Anomaly', 'Normal']

fig = plt.figure()
plot_confusion_matrix(cm, labels, title="Confusion Matrix---Random Forest", cmap=plt.cm.Blues)






#### XGBoost: fit on the train split, score on X_val

seed = 36

# NOTE(review): for a two-class target the usual objective is
# 'binary:logistic'; confirm that 'multi:softprob' with num_class=2 still
# makes predict() return 1-D class labels on the installed xgboost version,
# and that scale_pos_weight has any effect with this objective.
xgb2 = xgb.sklearn.XGBClassifier(
    learning_rate=0.1,
    n_estimators=10,  # 100
    max_depth=3,
    min_child_weight=11,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.7,
    objective='multi:softprob',
    num_class=2,
    n_jobs=-1,
    scale_pos_weight=1,
    seed=seed)

# Only the fit itself is timed.
t0 = time.time()
xgb2.fit(X_train, y_train)
t1 = time.time()

# The timing is stored and reported under the XGBoost name; this section
# previously reused the random-forest variable and printed "Random Forest".
t_xgb = t1 - t0
print('#################################################################')
print("XGBoost {:.3} s".format(t_xgb))
t.append(t_xgb)

y_pred = xgb2.predict(X_val)

# Both label vectors are negated so that the sorted class order becomes
# (-1, 0): row/column 0 is then the anomaly class (label 1) and row/column 1
# the normal class (label 0).
cm = confusion_matrix(-y_val_v1, -y_pred)

print(classification_report(y_val_v1, y_pred))
print(f1_score(y_val_v1, y_pred))

# F1 with the labels flipped (1 - label), i.e. with label 0 as the positive
# class.
print("F1:")
print(f1_score(-(y_val_v1 - 1), -(y_pred - 1)))

# Header only: the score computation itself is disabled.
print("roc_auc_score:")
# print(roc_auc_score(-(y_test-1), -(y_pred-1)))

# With the layout above: cm[0, 0] = TP, cm[0, 1] = FN, cm[1, 0] = FP,
# cm[1, 1] = TN (anomaly is the positive class).
FPR = cm[1, 0] / (cm[1, 0] + cm[1, 1])
TPR = cm[0, 0] / (cm[0, 0] + cm[0, 1])
ACC = (cm[0, 0] + cm[1, 1]) / (cm[0, 0] + cm[0, 1] + cm[1, 0] + cm[1, 1])
Precision = cm[0, 0] / (cm[1, 0] + cm[0, 0])
Recall = cm[0, 0] / (cm[0, 1] + cm[0, 0])

FPR_record.append(FPR)
TPR_record.append(TPR)
ACC_record.append(ACC)

print('FPR:')
print(FPR)

print('TPR:')
print(TPR)

print('ACC:')
print(ACC)

# Tick labels in matrix order (anomaly first, see the negation above).
labels = ['Anomaly', 'Normal']

fig = plt.figure()
plot_confusion_matrix(cm, labels, title="Confusion Matrix---XGBoost", cmap=plt.cm.Greens)





# ---------------------------------------------------------------------------
# XGBoost: refit on the train split, score on X_test
# ---------------------------------------------------------------------------
# Only the fit itself is timed.
t0 = time.time()
xgb2.fit(X_train, y_train)
t1 = time.time()

# The timing is stored and reported under the XGBoost name; this section
# previously reused the random-forest variable and printed "Random Forest".
t_xgb = t1 - t0
print('#################################################################')
print("XGBoost {:.3} s".format(t_xgb))
t.append(t_xgb)

y_pred = xgb2.predict(X_test)

# Negated labels sort as (-1, 0): index 0 of the matrix is the anomaly
# class (label 1), index 1 the normal class (label 0).
cm = confusion_matrix(-y_test, -y_pred)

print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred))

# F1 computed on flipped labels (1 - label): label 0 is the positive class.
print("F1:")
print(f1_score(-(y_test - 1), -(y_pred - 1)))

# Header only: the score computation itself is disabled.
print("roc_auc_score:")
# print(roc_auc_score(-(y_test-1), -(y_pred-1)))

# Matrix cells: [0, 0] = TP, [0, 1] = FN, [1, 0] = FP, [1, 1] = TN.
FPR = cm[1, 0] / (cm[1, 0] + cm[1, 1])
TPR = cm[0, 0] / (cm[0, 0] + cm[0, 1])
ACC = (cm[0, 0] + cm[1, 1]) / (cm[0, 0] + cm[0, 1] + cm[1, 0] + cm[1, 1])
Precision = cm[0, 0] / (cm[1, 0] + cm[0, 0])
Recall = cm[0, 0] / (cm[0, 1] + cm[0, 0])

FPR_record.append(FPR)
TPR_record.append(TPR)
ACC_record.append(ACC)

print('FPR:')
print(FPR)

print('TPR:')
print(TPR)

print('ACC:')
print(ACC)

# Tick labels in matrix order (anomaly first).
labels = ['Anomaly', 'Normal']

fig = plt.figure()
plot_confusion_matrix(cm, labels, title="Confusion Matrix---XGBoost", cmap=plt.cm.Greens)

