The mission of the project is to provide food aid to people displaced by the earthquake in Haiti who are sheltering under blue tarps. The data consists of pixel values captured from images of the affected sites. Each pixel corresponds to a color, and our goal is to identify the blue tarps. The shape of both datasets is given below.
The training data has ~63k records, of which 3.2% are Blue Tarps; the Hold Out dataset contains ~2 million records, of which less than 1% are Blue Tarps.
The goal is to compare various machine learning algorithms and discuss the metrics used to identify the best model. The models being compared are:
Approach:
Since our main mission is to save lives, we need to identify the Blue Tarps in order to reach the people under them. However, such missions are constrained by resources, time and budget, and in many cases we cannot satisfy all three. It is crucial to predict the blue tarps as accurately as the non-blue tarps, because we do not want to waste time and resources on False Positives. Similarly, we do not want to miss saving lives because of False Negatives.
Some important definitions for terms used throughout the document for quick reference:
True Positives (TP): These are cases in which we predicted YES (Blue Tarps), and they are Blue Tarps.
False Positive (FP): predicted YES, but not Blue Tarps. ("Type I error.")
True Negatives (TN): predicted NO (Not Blue Tarp), and they are not Blue Tarps.
False Negative (FN): predicted NO, but Blue Tarps ("Type II error.")
Accuracy: Overall, how often the classifier is correct
$Accuracy = (TP + TN) / (TP + FP + TN + FN)$
Balanced Accuracy: Normalizes true positive and true negative predictions by the number of positive and negative samples, respectively, and divides their sum by two
$Balanced Accuracy = (True Positive Rate + True Negative Rate) / 2$
AUC: Area under the curve. In this document it refers to the area under the precision-recall curve
Precision: Precision summarizes the fraction of examples assigned the positive class that belong to the positive class
$Precision = TP / (TP + FP)$
Sensitivity or Recall: Recall summarizes how well the positive class was predicted and is the same calculation as sensitivity
$Recall = TP / (TP + FN)$
F Score: This is a weighted harmonic mean of precision and the true positive rate (recall). The parameter beta defines the relative weight given to recall versus precision; the general formula and common special cases are given below:
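For reference, the general F-beta formula (a standard definition, added here for completeness) is $F_\beta = (1+\beta^2) \cdot (Precision \cdot Recall) / (\beta^2 \cdot Precision + Recall)$. With $\beta=1$ this is the usual F1 score (precision and recall weighted equally); $\beta=2$ treats recall as twice as important as precision; $\beta=0.5$ weights precision more heavily.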
False Discovery Rate: The expected proportion of type I errors. A type I error is where you incorrectly reject the null hypothesis; In other words, you get a false positive.
Logarithmic loss (log-loss): Log-loss measures the performance of a classification model whose predictions are probability values between 0 and 1. Log-loss increases as the predicted probability diverges from the actual label. It is a widely used metric in Kaggle competitions. The lower the log-loss value, the better the model's predictions.
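As a quick illustration of how these metrics can be computed with scikit-learn (a minimal sketch; the labels and probabilities below are invented purely for demonstration):
#illustrative only - hypothetical labels (1 = Blue Tarp) and predicted probabilities
import numpy as np
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, fbeta_score, log_loss
y_true = np.array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
y_prob = np.array([0.1, 0.2, 0.05, 0.3, 0.6, 0.1, 0.2, 0.9, 0.4, 0.8])
y_pred = (y_prob > 0.5).astype(int)  #apply the default 0.5 threshold
print('Accuracy:', accuracy_score(y_true, y_pred))
print('Balanced accuracy:', balanced_accuracy_score(y_true, y_pred))
print('Precision:', precision_score(y_true, y_pred))
print('Recall:', recall_score(y_true, y_pred))
print('F2 score:', fbeta_score(y_true, y_pred, beta=2))
print('Log-loss:', log_loss(y_true, y_prob))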
The following measures were taken to handle the class imbalance, to balance False Positives against False Negatives, and to accurately predict True Positives and True Negatives:
Due to the huge imbalance in the data, some of the metrics can be misleading, especially because we are interested in predicting the positive class, which is a minority. The scoring method used for cross-validation is 'fbeta'. Precision and recall both matter because in this scenario we care about resources, time and budget along with prediction capability. Therefore, giving the correct weight to balance precision and recall is critical. To demonstrate the capability of the algorithms I decided to use beta = 2, as I personally care more about FN. I created an interactive Jupyter dashboard which can be used to plug in cost values for TP, FP, TN, FN and provide a beta value to see how the costs shift for the various algorithms; it can be found here. This dashboard will help the parties make an informed decision by providing the metrics and costs. It can be extended to pass the algorithm or other parameters which might affect the decisions.
Though it has been challenging to determine the weight for precision and recall, it is hard to ignore the fact that costs will drive decisions in such a scenario. Therefore, for demonstration purposes I have considered the hypothetical costs below (TP, FP, TN, FN - per person):
Budget: $\$4$ million <br> __TP:__ $\$2000$
__FP:__ $\$800$ <br> __TN:__ $\$0$
__FN:__ $\$1200$
The rationale for the costs above is driven by a general cost-analysis model. When there is a budget, we assign a cost to everything. Therefore, though the main mission is to save lives, it comes with a cost: there are at least the costs of employees, fuel and food. As a result, identifying true positives is more expensive than the other categories. Because our mission is still to identify as many Blue Tarps as possible, we place a higher cost on FN and a somewhat lower cost on FP. If we correctly predict a TN then there is no cost, as no action is taken. To simplify the analysis, the goal is to stay within $\$4$ million while identifying as many Blue Tarps as possible. The cost analysis here is just an example to demonstrate how it can be approached.
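As a small sketch of how these hypothetical per-person costs translate into a total cost for a given confusion matrix (the TP/FP/TN/FN counts below are invented purely for illustration):
#hypothetical per-person costs and invented confusion-matrix counts (illustration only)
costs = {'TP': 2000, 'FP': 800, 'TN': 0, 'FN': 1200}
counts = {'TP': 600, 'FP': 150, 'TN': 60000, 'FN': 40}
total_cost = sum(costs[k] * counts[k] for k in costs)
print('Total cost: $' + format(total_cost, ',') + ' against the $4,000,000 budget')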
All the methods perform very well on the training data. K-fold cross-validation with 10 folds is performed for each method to choose its best model. The scoring method used for cross-validation is the f-score with beta = 2, which means FN carries more weight than FP. The best model is then used to select the threshold, again using the f-score with beta = 2. The figures shown below are, for each method: the confusion matrix at the default threshold of 0.5, the precision-recall curve used to select a threshold, the confusion matrix for the training data at the selected threshold, the cost matrix based on the predicted TP, FP, TN and FN, the confusion matrix for the hold-out data at the selected threshold, and the precision-recall curve for the hold-out data (mainly used to compute the AUC).
The AUC is based on the precision-recall curve, and we can observe that it is very high for all the methods on the training data. Since the data has a huge class imbalance and the positive class is a minority, I calculated some extra metrics which help in judging the performance of an algorithm: balanced-accuracy and log-loss (please see the definitions in the introduction section). Log-loss measures performance purely from the predicted probabilities, which is useful because accuracy is biased towards the larger class and precision/recall always involve a trade-off.
I also calculated run-time, which includes training and predicting on the Hold Out data; we will discuss this in the following sections. All the methods ran in under 7-8 minutes except the SVM kernel methods. The cost based on the training-data predictions is lowest for RF because RF's prediction of True Positives is not great (and each TP carries a cost). Therefore, we cannot say RF is the best cost-wise, because we are penalized with respect to our mission to save people. KNN does well on cost and has good metrics. Logistic Regression and the linear SVM did well cost-wise and on the other metrics, and the difference between their costs is not large.
The methods did not all perform the same on the Hold Out dataset. Linear SVC and SVM with a linear kernel stand out by performing well in almost all metrics, and Logistic Regression does well in all except, relatively, log-loss. All the others look good on accuracy but not on the other metrics. AUC is very low on the hold-out data except for Linear SVC and SVM with a linear kernel. Though we use cross-validation and hyper-parameter tuning to reduce over-fitting, some of the methods are not performing well on the metrics we are interested in, such as AUC. Balanced-accuracy is good for all except RF.
The reasons why performance could have dropped on the hold-out data are discussed in detail in the next section.
One of the critical steps for the performance of an algorithm is selecting the tuning parameters. Especially for RF and SVM there are many parameters we could consider tuning. Though we have convenient methods such as GridSearchCV which help with hyper-parameter tuning and cross-validation, they come at a cost in computational time: the more parameters and the wider the ranges we tune, the longer it takes to evaluate the different combinations. Therefore, it is very important to be strategic about hyper-parameter tuning.
Random Forest: Some important parameters which we can tune for Random Forest algorithm are:
These parameters are important to tune; otherwise RF might over-fit the data, as many tree-based algorithms do. Tuning all the parameters above is computationally expensive, and in a disaster-relief scenario you do not want to spend hours on tuning. With that trade-off in mind, my approach was to research the important parameters, understand which might be most suitable for our data, and favor those which also improve computational efficiency. I first experimented with everything listed above and then removed the parameters which did not make a real difference. I also experimented with value ranges and cut each range down based on the best model parameters chosen, e.g. if n_estimators never went above 100 then I did not include 1000 as a value. This may be problematic if the data changes; however, it is very easy to tweak the parameters without affecting any other code. I used oob_score=True, which helps with run-time, and class_weight='balanced_subsample' to maintain the balance of classes.
Decision tree parameters:
From the above I chose the most important parameters, max_depth and n_estimators, to begin with. I experimented with the rest and kept min_samples_leaf, which was helping with performance. Since I had the time, I was able to experiment; in a time-critical situation we could at least start with max_depth and n_estimators as the critical parameters. Overall, these choices also improved the run-time.
SVM: I compared Linear SVC and SVM with kernels – linear, radial and poly.
SVM is a great algorithm but unfortunately does not scale well to larger datasets; it is computationally expensive. However, on this dataset the linear and poly kernels did quite well, and the linear kernel took relatively little time. The radial kernel had a lower AUC and also took a very long time to run (~50 min). When I started this project I was fairly sure that the radial kernel would do best, given how it works, but it did not perform well on this large dataset. I tried a small sample to make it work, and it still took a very long time to run because of the tuning parameters. When an algorithm like Logistic Regression works seamlessly on this data with very little effort, I could not justify working much harder on the others. However, I still researched how SVM could be improved. LinearSVC is based on the LIBLINEAR library, developed by a group at National Taiwan University, and is an alternative implementation of a support vector classifier. The underlying code is different, but for large datasets with class imbalance it seems to work very well compared to the kernel SVM implementations. Again, I did not have to work very hard to tune it or to use a sub-sample, and the run-time is around a minute. I was fairly convinced to choose this over the other SVM methods. However, for demonstration purposes I still ran the other SVM methods and computed their metrics.
The tuning parameters for LinearSVC, as for the SVM linear kernel, are C and the tolerance. For the radial kernel there are C and gamma, and there needs to be a trade-off between them: C is used to control the error and gamma controls the curvature of the decision boundary. An interesting thing I learnt is that there is a default value for gamma called 'scale', which means $1 / (n\_features * X.var())$. That seems to have worked best for the curvature in this case, so I included it as part of the tuning grid. For poly, I used gamma and degree. Poly did well but took very long to run. There are a few other parameters such as coef0. Since these kernels are computationally expensive, I experimented in order to limit the parameters to the critical ones and used only those which mattered most for each kernel.
The results for the training and hold-out datasets are mostly consistent. Given the imbalance in the data and the increased size, we can see that log-loss increased and the AUC decreased. Based on log-loss, LinearSVC and SVM with a linear kernel performed great, along with other metrics such as balanced-accuracy and AUC. Logistic Regression did well in everything except a slight increase in log-loss. SVM with a poly kernel performed on par with SVM with a linear kernel; however, it takes a very long time to run (~50-60 min).
Only Random Forest and SVM with the rbf kernel show reduced balanced-accuracy and precision-recall AUC, though the log-loss is good for both. Random Forest has a tendency to over-fit if not properly tuned. The class imbalance and the largely linear nature of the data might also be an issue.
If the relationship between the features and the response is well approximated by a linear model, then an approach such as linear regression will likely work well and will outperform a method such as a regression tree that does not exploit this linear structure. If instead there is a highly non-linear and complex relationship between the features and the response, then decision trees may outperform classical approaches (pg 315, ISLR).
We can also look at the decision-boundary graphs below to compare the methods; these are on the training data. Each method does well on the training data, but if we remind ourselves of the shape of the hold-out data and focus on RF and SVM RBF, we can see how over-fitting could have become an issue, which explains the dip in their precision-recall AUC and balanced-accuracy. The LDA and QDA methods did not perform the best because the data does not meet the normality assumptions. In contrast, all the linear methods and KNN did well.
Some important steps taken to address the class imbalance were using stratified k-fold cross-validation and setting the class_weight attribute to 'balanced'. KNN by default considers the classes to be uniform. Performing cross-validation with the imbalance in mind produced stable results. We can observe this through the 10-fold variability charts below (please note the scale of the graphs; the ups and downs can be misleading otherwise). These are the f-beta scores of each fold in cross-validation, and the dotted line is the mean. LDA and QDA have the widest interval (~0.1), the rest of the methods have an interval of 0.03-0.04, and the SVM methods have an interval of 0.01-0.02. This shows there is not much uncertainty, and the models can be considered stable. I also experimented with accuracy and balanced-accuracy as the cross-validation scoring method instead of fbeta. The variability interval for accuracy was ~0.001, and for balanced-accuracy it was the same as for accuracy, but the performance of the algorithms dipped slightly on the other metrics. Therefore, it is critical to choose a model based on the factors which are important to us, and not just on accuracy, which can affect the final performance.
The main metrics to consider for this scenario are cost, log-loss, precision-recall AUC, balanced-accuracy and run-time. Since there is already a clear separation among the methods on balanced-accuracy and AUC, I focus below on log-loss, cost and run-time. Log-loss is a global metric that characterizes the algorithm itself, without taking into account that we care about FN/FP in this situation. Clearly, the Linear SVC model performed well in all aspects and scaled well with the large data, class imbalance, run-time and costs. However, I am a little hesitant to pick it as the best model, since there is not much literature on it or many examples of it. I also know that, in general, SVM does not scale well to larger data; it is ideal for high-dimensional spaces. Therefore, I am also hesitant to say SVM with a linear kernel is the best algorithm. Purely by the metrics (including log-loss), both algorithms stand out. However, I would still choose Logistic Regression over both, because I understand it well, I can explain its results better, and it is known to be robust. It also required the least effort and still performed very well. Logistic Regression did well on all the metrics on both the training and hold-out data. The difference in costs between Logistic Regression and Linear SVC is very small, so why go with a more complicated model when logistic regression is doing so well? If class imbalance were not an issue, RF would be next in line with very low costs. LDA and QDA, as expected, did not do well; the data does not meet the normality assumption, and therefore Logistic Regression had an edge over both.
If we had real costs, we could have made the decision based on costs. The mock cost matrix for each method by TP, FP, TN and FN is given here.
Accuracy, ROC-AUC, FDR and Specificity are commonly used and generally good metrics for measuring the performance of an algorithm. However, all of these metrics emphasize the larger class and can be misleading for a dataset with class imbalance like ours. Therefore, it is not ideal to choose the best algorithm based on these metrics alone. Consider a sample with 97 negative and 3 positive values: classifying all values as negative gives an accuracy score of 0.97, but we may be more interested in the positive-class predictions, which this metric does not reveal. ROC is sensitive to the class-imbalance issue, meaning that it favors the class with the larger population solely because of its higher population; it is biased towards the larger class when it comes to classification/prediction. This is problematic in scenarios like ours, where we are interested in the positive class, which is a minority.
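The 97-negative / 3-positive example above can be reproduced directly (a minimal sketch; the all-negative classifier is the degenerate baseline described in the text):
import numpy as np
from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score
y_true = np.array([0] * 97 + [1] * 3)  #97 negatives, 3 positives
y_pred = np.zeros(100, dtype=int)      #classify everything as negative
print('Accuracy:', accuracy_score(y_true, y_pred))                    #0.97 - looks excellent
print('Balanced accuracy:', balanced_accuracy_score(y_true, y_pred))  #0.50 - no better than chance
print('Recall:', recall_score(y_true, y_pred))                        #0.00 - every positive missed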
We saw how to address the imbalance issue for cross-validation in the second section. For metrics, we can use balanced-accuracy and log-loss. Since we not only care about TP but also about FN and FP, we can also consider precision, recall, the precision-recall curve and its AUC. Balanced-accuracy (see the definition in the objective) can be used instead of accuracy so that both classes are represented in the metric. Because we care about both FN and FP, it is hard to decide based on precision and recall alone, as they always involve a trade-off. In such cases we can use the precision-recall AUC and log-loss to help choose the best algorithm. Log-loss is a widely used metric in Kaggle competitions but is a global metric: the lower the log-loss value, the better the model's predictions.
The other very critical metrics are cost and run-time. As discussed in the first section, the kernel SVMs take a very long time to run, while all the other algorithms take considerably less time; KNN takes the longest excluding the SVM kernel methods. Costs vary based on what is important to us: in this case, even though an algorithm may predict TPs accurately, that may be more expensive, and at the same time False Negatives are also crucial for our mission. Logistic Regression strikes a balance between performance and cost.
‘No Free Lunch’ is truly the main takeaway from many aspects of this whole exercise: not only choosing a statistical method, but also choosing tuning parameters, scoring methods and metrics, and understanding the various attributes of the methods and their significance. The software packages in scikit-learn are very convenient to use; however, they are of little use if we do not understand the requirements at hand and the methodology of the statistical methods and their significance.
Run-time was very frustrating for RF and the SVM kernel methods. Some important attributes help: n_jobs is critical for allocating threads to the process, and oob_score=True for RF improves run-time. Scaling the data is critical for the SVM methods. Convenient methods such as GridSearchCV make parameter tuning pretty easy and smooth; however, it is critical to understand which parameters to tune, what they mean, how to improve run-time, and when and how to scale the data. class_weight is an attribute for handling the class imbalance, and the Pipeline package can be used to scale the data during cross-validation while preventing information leakage into the test data. There are also packages such as SMOTE and imblearn which help by resampling the data using weights; it was challenging to compute the right weights, with a risk of over- or under-representation, so there is more learning opportunity here. Finally, there are different search methods, such as randomized search and grid search. Depending on the use case both can be considered: randomized search may improve run-time, while grid search is exhaustive over the chosen grid. For grid search it is important to pick only the most important parameters, as it can be computationally expensive. It is crucial to know how to best use computational resources and manage the run-time.
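As a follow-up to the resampling and randomized-search ideas above, the sketch below (not used in this analysis) shows how SMOTE from the imbalanced-learn package could be combined with a pipeline and RandomizedSearchCV so that resampling happens only inside the training folds; the logistic model and parameter values are placeholders chosen for illustration:
#illustrative sketch only - assumes the imbalanced-learn (imblearn) package is installed
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline  #applies the sampler only when fitting, never when scoring
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
f2_scorer = make_scorer(fbeta_score, beta=2)
pipe = ImbPipeline([('scale', StandardScaler()),
('smote', SMOTE(random_state=1)),  #oversample the minority (Blue Tarp) class
('clf', LogisticRegression(max_iter=1000))])
param_dist = {'clf__C': [0.01, 0.1, 1, 10, 100]}
search = RandomizedSearchCV(pipe, param_dist, n_iter=3, scoring=f2_scorer,
cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1), n_jobs=-1, random_state=1)
#search.fit(X_train, y_train)  #X_train/y_train are created later in this notebook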
#### Libraries
# imports and setup
##https://stackoverflow.com/questions/18380168/center-output-plots-in-the-notebook/18401835
from IPython.display import HTML
#IPython.Cell.options_default.cm_config.lineNumbers = False;
import IPython.core.display as di
from IPython.display import Image
import os
import re
import random
import numpy as np
import pandas as pd
import seaborn as sns
#import swifter
import warnings
import time# to measure runtime of the algorithms
import itertools
import statistics as stat
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.colors import ListedColormap
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, auc, make_scorer, recall_score, precision_score,plot_confusion_matrix, f1_score, fbeta_score, plot_roc_curve, precision_recall_curve,plot_precision_recall_curve,average_precision_score, log_loss
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold, LeaveOneOut, RepeatedKFold, StratifiedKFold, GridSearchCV, RandomizedSearchCV, StratifiedShuffleSplit, cross_val_predict, learning_curve, validation_curve
from sklearn import preprocessing
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
from mlxtend.plotting import plot_decision_regions
from sklearn.cluster import KMeans
sns.set() # will create prettier matplotlib plots
%matplotlib inline
#global format settings for plots
plt.rcParams['font.sans-serif'] = 'Arial'
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['text.color'] = '#000000'
plt.rcParams['axes.labelcolor']= '#000000'
plt.rcParams['xtick.color'] = '#000000'
plt.rcParams['ytick.color'] = '#000000'
plt.rcParams['font.size']=12
warnings.filterwarnings('ignore')
pd.set_option('precision', 3) # number precision for pandas
pd.set_option('display.max_rows', 12)
pd.set_option('display.float_format', '{:20,.3f}'.format) # get rid of scientific notation
#### EDA Functions
#Reusable function EDA
def calcProportion(dataset):
#saving training data proportions by class in a temporary dataframe
tempdf=pd.DataFrame(index=dataset.Tarp_Flag.unique())
#tempdf.set_index(0)
tempdf['Proportion'] = 0
tempdf["Proportion"] = tempdf["Proportion"].astype(float)
tempdf['Proportion']['Not Blue']= round((dataset.Tarp_Flag[dataset.Tarp_Flag=='Not Blue'].count()/len(dataset))*100,2)
tempdf['Proportion']['Blue']= round((dataset.Tarp_Flag[dataset.Tarp_Flag=='Blue'].count()/len(dataset))*100,2)
return tempdf
##https://www.dataforeverybody.com/matplotlib-seaborn-pie-charts/
def subplotProportion(plt,tempdf1,tempdf2):
f, axs = plt.subplots(1, 2,figsize=(12,6));#figsize=(10,6)
axs[0].pie(x=tempdf1['Proportion'],autopct="%1.2f%%",labels=tempdf1.index,explode=(0.3,0),colors=['#FF7F50','Lightblue'])
axs[0].set_title("Proportion of the Blue Tarps in the Training Data");
axs[1].pie(x=tempdf2['Proportion'],autopct="%1.2f%%",labels=tempdf2.index,explode=(0.3,0),colors=['Lightblue','#FF7F50'])
axs[1].set_title("Proportion of the Blue Tarps in the Hold Out Data");
return plt
#### Cost Analysis Functions
#https://www.kaggle.com/badolar/using-cost-benefit-matrix-with-confusion-matrix
#https://www.kdnuggets.com/2018/10/confusion-matrices-quantify-cost-being-wrong.html
#https://www.educba.com/cost-benefit-analysis-formula/
def dataForCostAnalysis(TP,FP,TN,FN):
df = pd.DataFrame({'TP':[TP],'FP':[FP],'TN':[TN], 'FN':[FN]})
return df
def costAnalysis(cost_data,cost_list,method,threshold):
data= {'Item': ['TP', 'FP', 'TN', 'FN']}
cost_mat = pd.DataFrame(data)
cost_mat['Cost per Person($)'] = cost_list
cost_mat['Method & Threshold'] = str(method)+'& thresh='+str(threshold)
cost_mat['Value'] = 0
cost_mat.loc[0,'Value'] = cost_data['TP'].values; cost_mat.loc[1,'Value'] = cost_data['FP'].values
cost_mat.loc[2,'Value'] = cost_data['TN'].values; cost_mat.loc[3,'Value'] = cost_data['FN'].values
cost_mat = cost_mat.set_index('Item')
cost_mat['Cost']=cost_mat['Cost per Person($)']*cost_mat['Value']
return cost_mat
#### Calculate Confusion Matrix, Optimal Threshold and other Metrics (used in predictBlueTarp)
# Reusable functions which are called in the predictBlueTarp function.
# apply threshold to positive probabilities to create labels
def to_labels(pos_probs, threshold):
return (pos_probs > threshold).astype('int')
#computes the optimal threshold based on the f-beta score. Called from callPrecision_Recall_Curve
def getoptimalThreshold(ax, precision, recall,thresholds, probs,y_train,beta):
#the function computes an f-beta score for the data at all the thresholds
scores_t = [fbeta_score(y_train, to_labels(probs, t),beta=beta) for t in thresholds]
# get index of best threshold with max score
ix = np.argmax(scores_t)
#print('\nBeta=%.2f, Best Threshold=%.3f, F-Score=%.3f' % (beta, thresholds[ix], scores_t[ix]))
# calculate the no skill line as the proportion of the positive class
no_skill = len(y_train[y_train==1]) / len(y_train)
# plot the no skill precision-recall curve
ax.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
# plot the model precision-recall curve
ax.plot(recall, precision, marker='.', label='method')
ax.scatter(recall[ix], precision[ix], marker='D', color='black', label='Best')
# axis labels
ax.set_title('Precision Recall Curve for Training Data')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
# show the legend
ax.legend()
return thresholds[ix]
# https://machinelearningmastery.com/threshold-moving-for-imbalanced-classification/
# Function called to plot ROC and compute optimal threshold
# Also calls getOptimalThreshold and returns it
def callPrecision_Recall_Curve(ax, fit,X,y,beta):
y_pred = fit.predict_proba(X)
# keep probabilities for the positive outcome only
y_pred = y_pred[:,1]
precision, recall, thresholds = precision_recall_curve(y, y_pred)
best_thresh = getoptimalThreshold(ax,precision,recall,thresholds,y_pred,y,beta)
return best_thresh
#Mainly used to fill the final table of metrics exactly as asked and plot the confusion matrix(cm) where required
#takes the best estimator and any X and y values along with a threshold to compute the metrics and cm. Used in the exploreConfusionMatrix function
#https://github.com/wcipriano/pretty-print-confusion-matrix
#https://matplotlib.org/3.1.1/tutorials/colors/colormaps.html
def getClassificationMetrics(ax,dataset,fit,X,y,y_pred,threshold):
#compute predicted probabilities
pred_prob = fit.predict_proba(X)
#compute log-loss as an extra metric
l_loss = log_loss(y, pred_prob)
#keep probabilities for the positive outcome only
pred_prob = pred_prob[:,1]
CM = confusion_matrix(y,y_pred)
TN = CM[0][0]; FN = CM[1][0] ; TP = CM[1][1]; FP = CM[0][1]
Population = TN+FN+TP+FP
Accuracy = round( (TP+TN) / Population,4)
Recall = round( TP / (TP+FN),3 )
FPR = round( FP / (TN+FP),3 )
Specificity = 1-FPR
FDR = round( FP / (TP+FP),3 )
Precision = round( TP / (TP+FP),3 )
TNR = round(TN/(TN+FP),3)
b_accuracy = (Recall+TNR)/2
#precision and recall arrays to calculate the precision-recall auc
precision, recall, _ = precision_recall_curve(y, pred_prob);
#calculating the precision-recall auc and not roc_auc - very important
auc_score = round(auc(recall,precision),4)
if(dataset=='train'): plt.title("Matrix (Training Data) - Optimal Thresh=%.3f" %threshold)
else: plt.title("Matrix (Hold Out Data) - Optimal Thresh=%.3f" %threshold)
#get values from the confusion matrix inbuilt function and use it in a seaborn plot to make it look pretty
group_counts = ["{0:0.0f}".format(value) for value in CM.flatten()]
group_percentages = ["TN /(TN+FP) = {0:.2%}".format(TN/(TN+FP)), "FP / (TN+FP ) = {0:.2%}".format(FPR), "FN / (TP+FN) = {0:.2%}".format(FN/(TP+FN)), \
"TP / (TP+FN) = {0:.2%}".format(Recall)]
labels = [f'{v1}\n{v2}' for v1, v2 in zip(group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
kwargs = {'alpha':.9,'linewidth':.5,"capstyle":'projecting'}
cmap = sns.diverging_palette(220,20,as_cmap=True)
ax = sns.heatmap( pd.DataFrame(CM, columns=['Predicted Negative', 'Predicted Positive'], index=['Negative', 'Positive']), annot=labels, \
annot_kws={"size": 10},cmap=cmap,fmt='',**kwargs)
#store metrics in a data frame and return. This format will be useful to plot the final table
metrics = pd.DataFrame({
'Metrics':['Accuracy','Balanced-Accuracy','AUC','Threshold','Recall','Specificity','FDR', 'Precision', 'log-loss'],\
'Value':[Accuracy,b_accuracy, auc_score,threshold,Recall,Specificity,FDR, Precision, l_loss]})
metrics = metrics.set_index('Metrics')
#return the metrics and also TP, FP, TN, FN for cost analysis
return metrics, dataForCostAnalysis(TP,FP,TN,FN)
#Since we are using training/test data for the confusion matrix to compute the threshold and later also want to use it for the hold out, creating one reusable function
#takes the best estimator from cross validation, x,y and a threshold to call metrics and confusion matrix
def exploreConfusionMatrix(ax,dataset,fit,X,y,threshold):
threshold = float(threshold)
y_pred = fit.predict_proba(X)
# keep probabilities for the positive outcome only
y_pred = y_pred[:,1]
pred = [ 0 if x < threshold else 1 for x in y_pred]
#return metrics and cost data based on the predictions
metrics, cost_data = getClassificationMetrics(ax,dataset,fit,X,y,pred,threshold)
return pd.DataFrame(metrics),cost_data
#### Predict Blue Tarp or Not
#this function will predict if the RGB predictors passed are classified as blue tarp or not and computes metrics and run time for the whole process
def predictBlueTarp(plt,stat_method,grid,params,cv,scoring,X_train,y_train,X_HO,y_HO,beta):
#measuring algorithm run time
#TP,FP,TN,FN
cost_list=[2000,800,0,1200]
start = time.time()
#prep subplots to create one pretty visualization combined and easy to read
fig = plt.figure(figsize=(18,10))#figsize=(18,10)
spec = gridspec.GridSpec(ncols=3, nrows=2, figure=fig)
f_ax1 = fig.add_subplot(spec[0, 0])
f_ax2 = fig.add_subplot(spec[0, 1])
f_ax3 = fig.add_subplot(spec[0, 2])
f_ax4 = fig.add_subplot(spec[1, 0])
f_ax5 = fig.add_subplot(spec[1, 1])
f_ax6 = fig.add_subplot(spec[1, 2])
#perform cross validation to tune the parameters and fit the model with train/test data
model = grid(stat_method,params,cv=cv,scoring=scoring,n_jobs=-1,return_train_score=True)
result_tr = model.fit(X_train,y_train);p=result_tr.best_params_;cv_results_tab = pd.DataFrame(model.cv_results_)
#LinearSVC does not have a predict_proba method. Therefore using CalibratedClassifierCV to get the probabilities
if(str(result_tr.estimator).find('LinearSVC')!=-1):
result_tr = CalibratedClassifierCV(result_tr,cv='prefit');result_tr.fit(X_train,y_train)
#save as first subplot
plt.subplot(spec[0, 0])
#predict values using the training/test dataset and understand the confusion matrix based on the default threshold.
#So that will help us in determining the optimal threshold.
#Passing plot axes and name of the dataset to use in the plot and its title
exploreConfusionMatrix(f_ax1, 'train',result_tr,X_train,y_train,0.5)
#save as second subplot
#get the optimal threshold using the f-beta score and plot the precision recall curve
plt.subplot(spec[0, 1])
thresh_tr = callPrecision_Recall_Curve(f_ax2,result_tr,X_train,y_train,beta)
#save as third subplot
plt.subplot(spec[0,2])
#Assign predicted labels based on the optimal threshold thresh_tr computed using the best model identified via k-fold CV
metrics_train, cost_data_tr = exploreConfusionMatrix(f_ax3,'train',result_tr,X_train,y_train,thresh_tr)
cost_tr_mat = costAnalysis(cost_data_tr,cost_list,stat_method,thresh_tr)
cost_tr = cost_tr_mat['Cost'].sum()
plt.subplot(spec[1, 0])
sns.heatmap(pd.DataFrame(cost_tr_mat.iloc[:,3]),ax=f_ax4,annot=True,cmap=cmap,cbar=False,fmt='.2f',**kwargs)
f_ax4.set_title('Costs on Training Confusion Matrix(Total: $'+str(cost_tr)+')')
f_ax4.set_yticklabels(pd.DataFrame(cost_tr_mat.iloc[:,3]).index,rotation=0)
##Testing with Hold out using the computed threshold
#save as fifth subplot
plt.subplot(spec[1, 1])
#storing metrics for hold out data and returning to print in the final table
metrics_HO,cost_data_ho = exploreConfusionMatrix(f_ax5,'HO',result_tr,X_HO,y_HO,thresh_tr)
#plot precision recall for hold out data
plt.subplot(spec[1, 2])
#https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html#sphx-glr-auto-examples-model-selection-plot-precision-recall-py
precision, recall, _ = precision_recall_curve(y_HO,result_tr.predict_proba(X_HO)[:,1])
no_skill = len(y_HO[y_HO==1]) / len(y_HO)
f_ax6.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
f_ax6.plot(recall, precision, marker='.')
f_ax6.set_title('Precision Recall Curve for Hold Out Data');f_ax6.set_xlabel('Recall');f_ax6.set_ylabel('Precision')
#calculate the costs from the confusion matrix counts
cost_ho_mat = costAnalysis(cost_data_ho,cost_list,stat_method,thresh_tr)
cost_ho = cost_ho_mat['Cost'].sum()
#show all plots
#plt.show()
#end the algorithm timer
end = time.time()
#calculate the total time taken in mins. Default is in seconds
total_time = round((end - start)/60,1)
#return metrics for hold out data, best parameters, total time and cost to populate the final table
return metrics_train, metrics_HO, p , cv_results_tab, total_time, cost_tr, cost_ho
#### Function to create final table for training and hold out data
#takes all the metrics computed for each method and stores in one data frame to print finally
def createFinalResultsTable(metrics,method,runtime,cost):
##Creating a table to report final results of the metrics obtained from all the four methods
data = {"Metrics":['Accuracy','Balanced-Accuracy','AUC','Threshold','Recall','Specificity','FDR','Precision','log-loss',"Run-time",'Cost'],\
"KNN" : [0,0,0,0,0,0,0,0,0,0,0],"LDA" : [0,0,0,0,0,0,0,0,0,0,0], "QDA" : [0,0,0,0,0,0,0,0,0,0,0], "Logistic Regression":[0,0,0,0,0,0,0,0,0,0,0], \
"RF":[0,0,0,0,0,0,0,0,0,0,0], "SVM":[0,0,0,0,0,0,0,0,0,0,0] }
table = pd.DataFrame(data)
table = table.set_index('Metrics')
if(method=="Logistic Regression"):
table["Logistic Regression"]=metrics
table["Logistic Regression"]["Run-time"]=runtime
table["Logistic Regression"]['Cost']=cost
elif(method=="LDA"):
table["LDA"]=metrics
table["LDA"]["Run-time"]=runtime
table["LDA"]['Cost']=cost
elif(method=="QDA"):
table["QDA"]=metrics
table["QDA"]["Run-time"]=runtime
table["QDA"]['Cost']=cost
elif(method=="RF"):
table["RF"]=metrics
table["RF"]["Run-time"]=runtime
table["RF"]['Cost']=cost
elif(method=="SVM"):
table["SVM"]=metrics
table["SVM"]["Run-time"]=runtime
table["SVM"]['Cost']=cost
elif(method=="SVM Other"):
method = 'SVM'
table["SVM"]=metrics
table["SVM"]["Run-time"]=runtime
table["SVM"]['Cost']=cost
else:
table["KNN"] = metrics
table["KNN"]["Run-time"]=runtime
table["KNN"]['Cost']=cost
return table[method]
#https://stackoverflow.com/questions/51459406/how-to-apply-standardscaler-in-pipeline-in-scikit-learn-sklearn
def comparePredictionMethod(plt,method,model,X_train,y_train,X_HO,y_HO,param,kf_10,scoring,beta,tab_tr,tab_ho):
pipe = Pipeline([('scale', preprocessing.StandardScaler()),('clf', model)])
metrics_tr, metrics_HO, p , tab, runtime, cost_tr, cost_ho = predictBlueTarp(plt,pipe,GridSearchCV,param,kf_10,scoring,X_train,y_train,X_HO,y_HO,beta)
#populate final table for the given method with the metrics for train data and hold out data
tab_tr[str(method)] = createFinalResultsTable(metrics_tr, str(method),runtime, cost_tr)
tab_ho[str(method)] = createFinalResultsTable(metrics_HO, str(method),runtime, cost_ho)
if(str(method)=='SVM Other'):
tab_tr.rename(columns={str(method):'SVM_'+str(model.kernel)+' (tuning parameters = '+str(p)+')'},inplace=True)
tab_ho.rename(columns={str(method):'SVM_'+str(model.kernel)+' (tuning parameters = '+str(p)+')'},inplace=True)
elif(str(method)=='SVM'):
tab_tr.rename(columns={str(method):'Linear SVC (tuning parameters = '+str(p)+')'},inplace=True)
tab_ho.rename(columns={str(method):'Linear SVC (tuning parameters = '+str(p)+')'},inplace=True)
elif(str(method)=='RF'):
final_table_tr.rename(columns={"RF":'RF (tuning parameters = '+str(p)+')'},inplace=True)
final_table_ho.rename(columns={"RF":'RF (tuning parameters = '+str(p)+')'},inplace=True)
elif(str(method)=='KNN'):
n=p['clf__n_neighbors']
final_table_tr.rename(columns={"KNN":'Knn='+str(n)},inplace=True)
final_table_ho.rename(columns={"KNN":'Knn='+str(n)},inplace=True)
return tab,p
#https://python-graph-gallery.com/125-small-multiples-for-line-chart/
def plotVariability(v):
# create a color palette
palette = plt.get_cmap('Set2')
plt.figure(figsize=(15,8))#figsize=(20,8)
# multiple line plot
num=0
for column in v:
num+=1
# Find the right spot on the plot
plt.subplot(2,3, num)
# Plot the lineplot
plt.plot([1,2,3,4,5,6,7,8,9,10], v[column], marker='o', color=palette(num), linewidth=1.9);
mean=v[column].mean()
# Plot the average line
plt.plot([1,2,3,4,5,6,7,8,9,10],[mean,mean,mean,mean,mean,mean,mean,mean,mean,mean], label='Mean', linestyle='--');
# Add title
plt.title(str(column)+' (score interval: '+str(np.round(v[column].min(),2))+' ,'+str(np.round(v[column].max(),2))+')', loc='center');
plt.suptitle('10-Fold test score variability')
#### Clean Training Data
#Read CSV, look at data, check for nulls and categorize the Class variable in the data
hp = pd.read_csv("HaitiPixels.csv")
hp['Class'] = hp['Class'].astype('category')
#Create a new dummy variable for Blue Tarp or not
hp['Tarp_Flag'] = ['Blue' if i == 'Blue Tarp' else 'Not Blue' for i in hp['Class']]
hp['Tarp_Flag'] = hp['Tarp_Flag'].astype('category')
#### Clean Hold Out Data
#https://www.kite.com/python/answers/how-to-set-column-names-when-importing-a-csv-into-a-pandas-dataframe-in-python
#https://stackoverflow.com/questions/18039057/python-pandas-error-tokenizing-data
labels =['X','Y','Map X','Map Y','Lat','Lon','Red','Green','Blue']
files_Non_Blue_Tarp = ['HOData\orthovnir057_ROI_NON_Blue_Tarps.txt','HOData\orthovnir067_ROI_NOT_Blue_Tarps.txt','HOData\orthovnir069_ROI_NOT_Blue_Tarps.txt',\
'HOData\orthovnir078_ROI_NON_Blue_Tarps.txt']
files_Blue_Tarp = ['HOData\orthovnir067_ROI_Blue_Tarps.txt','HOData\orthovnir069_ROI_Blue_Tarps.txt','HOData\orthovnir078_ROI_Blue_Tarps.txt']
ho_data_nb = pd.DataFrame()
ho_data_b = pd.DataFrame()
#Loop through non-blue tarp files
for filename in files_Non_Blue_Tarp:
data = pd.read_csv(filename, skiprows=8, header=None, names=labels, delim_whitespace=True, index_col=0)
ho_data_nb = ho_data_nb.append(data)
#Set Tarp_Flag = 0 for non-blue tarp records
ho_data_nb['Tarp_Flag'] = 0
#Loop through blue tarps files
for filename in files_Blue_Tarp:
data = pd.read_csv(filename, skiprows=8, header=None, names=labels, delim_whitespace=True, index_col=0)
ho_data_b = ho_data_b.append(data)
#Create tarp flag for blue-tarps =1
ho_data_b['Tarp_Flag'] = 1
# Stack the DataFrames on top of each other
ho_data_final = pd.concat([ho_data_b, ho_data_nb], axis=0)
#Create a new dummy variable for Blue Tarp or not and plot the imbalance in data
ho_data_final['Tarp_Flag'] = ['Blue' if i == 1 else 'Not Blue' for i in ho_data_final['Tarp_Flag']]
ho_data_final['Tarp_Flag'] = ho_data_final['Tarp_Flag'].astype('category')
#### Exploratory Data Analysis
#calculate proportion of blue tarps in training set
tr_df = calcProportion(hp)
##calculate proportion of blue tarps in hold out set
ho_df = calcProportion(ho_data_final)
#plot the proportions for EDA
subplotProportion(plt,tr_df,ho_df);
plt.savefig('eda.png')
plt.close()
s_tr = sns.pairplot(hp,diag_kind='kde',corner=True,hue='Tarp_Flag');
s_tr.fig.suptitle('Pairwise plot for Training Data')
s_tr.savefig('hp_pairplot.png')
plt.close()
ho = ho_data_final.iloc[:,6:]
s_ho = sns.pairplot(ho,diag_kind='kde',corner=True,hue='Tarp_Flag');
s_ho.fig.suptitle("Pairwise plot for Hold Out Data")
s_ho.savefig('ho_pairplot.png')
plt.close()
#converting flag values to 1 and 0 for ease of use
hp['Tarp_Flag'] = np.where(hp['Class'].str.contains("Blue Tarp"), 1, 0)
#converting flag values to 1 and 0 for ease of use
ho_data_final['Tarp_Flag'] = np.where(ho_data_final['Tarp_Flag'].str.contains("Not Blue"), 0, 1)
#### Setting some global variables used throughout the code
#Create train/test and hold out data set
#create a random_state variable that can be used consistently through out all the methods
beta = 2 # care twice as much about FN as FP. I personally want to save more people.
f_score = make_scorer(fbeta_score, beta=beta)
random_state = 1;scoring = f_score
#create a stratified k-fold cross validation variable which can be passed as a cross validation variable across all methods
kf_10 = StratifiedKFold(n_splits=10,shuffle=True, random_state=random_state)
#split train/test and hold out data. Train/test data will be called just train across this program
X_train = hp[['Red','Green','Blue']]; y_train = hp['Tarp_Flag']
X_HO = ho_data_final[['Red','Green','Blue']];y_HO = ho_data_final['Tarp_Flag']
#scale predictors
X_train_sc = preprocessing.scale(X_train);X_HO_sc = preprocessing.scale(X_HO)
#initialize a table data frame to store the final table results for training and hold out
final_table_tr, final_table_ho=pd.DataFrame(),pd.DataFrame()
#other table to store various SVC methods
svm_tab_tr,svm_tab_ho = pd.DataFrame(),pd.DataFrame()
#creating a table to store variability results for all the methods
variability_tab,variability_tab_svm = pd.DataFrame(),pd.DataFrame()
#defining consistent cmap for various plots
#https://medium.com/@morganjonesartist/color-guide-to-seaborn-palettes-da849406d44f
kwargs = {'alpha':.9,'linewidth':.5,"capstyle":'projecting'}
cmap = sns.diverging_palette(220, 20, n=4,as_cmap=True)
#### Testing Different Methods to Compare their Performance in Predicting Blue Tarps
#LDA doesn't need hyper parameter tuning but passing n_components to use Gridsearch. This made it easier to standardize the cross validation function called for each method
#print('Linear Discriminant Analysis:\n')
param_lda = {'clf__n_components': [1]}
tab_lda,p_lda = comparePredictionMethod(plt,'LDA',LinearDiscriminantAnalysis(),X_train,y_train,X_HO,y_HO,param_lda,kf_10,scoring,beta,final_table_tr,final_table_ho)
plt.savefig('plot_lda.png', dpi=150, bbox_inches='tight')
plt.close()
variability_tab = variability_tab.append(tab_lda[tab_lda['params']==p_lda].iloc[:,[29,16,17,6,7,8,9,10,11,12,13,14,15]])
#https://stackoverflow.com/questions/43641855/how-do-i-rename-an-index-row-in-python-pandas/43641960
variability_tab = variability_tab.rename({variability_tab.index[0]:'LDA'})
#QDA doesn't need much hyper parameter tuning but passing reg_param to use Gridsearch. This made it easier to standardize the cross validation function called for each method
#print('\nQuadratic Discriminant Analysis:\n')
param_qda = {'clf__reg_param': (0.00001, 0.0001, 0.001,0.01, 0.1)}
tab_qda,p_qda = comparePredictionMethod(plt,'QDA',QuadraticDiscriminantAnalysis(),X_train,y_train,X_HO,y_HO,param_qda,kf_10,scoring,beta,final_table_tr,final_table_ho)
plt.savefig('plot_qda.png', dpi=150, bbox_inches='tight')
plt.close()
variability_tab = variability_tab.append(tab_qda[tab_qda['params']==p_qda].iloc[:,[29,16,17,6,7,8,9,10,11,12,13,14,15]])
variability_tab = variability_tab.rename({variability_tab.index[1]:'QDA'})
#KNN
#print('\nK Neighbors Classifier:\n')
param_knn = {'clf__n_neighbors':np.arange(1,100)};knn = KNeighborsClassifier()
tab_knn,p_knn = comparePredictionMethod(plt,'KNN',knn,X_train,y_train,X_HO,y_HO,param_knn,kf_10,scoring,beta,final_table_tr,final_table_ho)
plt.savefig('plot_knn.png', dpi=150, bbox_inches='tight')
plt.close()
variability_tab = variability_tab.append(tab_knn[tab_knn['params']==p_knn].iloc[:,[29,16,17,6,7,8,9,10,11,12,13,14,15]])
variability_tab = variability_tab.rename({variability_tab.index[2]:'KNN'})
#Logistic
#https://www.kaggle.com/joparga3/2-tuning-parameters-for-logistic-regression
#print('\nLogistic Regression:\n')
param_lr = {'clf__C': [0.001,0.01,0.1,1,10,100,1000]}
tab_lr, p_clf = comparePredictionMethod(plt,'Logistic Regression',LogisticRegression(class_weight='balanced'),X_train,y_train,X_HO,y_HO,param_lr,kf_10,scoring,beta,final_table_tr,final_table_ho)
plt.savefig('plot_log.png', dpi=150, bbox_inches='tight')
plt.close()
#show variability stats
variability_tab = variability_tab.append(tab_lr[tab_lr['params']==p_clf].iloc[:,[29,16,17,6,7,8,9,10,11,12,13,14,15]])
variability_tab = variability_tab.rename({variability_tab.index[3]:'Logistic'})
#Random Forest
#print('\nRandom Forest:\n')
param_rf = {'clf__n_estimators': [50, 100, 300],'clf__max_depth': [5, 8, 15, 25],'clf__min_samples_leaf': [1, 2, 5]}
tab_rf,p_rf = comparePredictionMethod(plt,'RF',RandomForestClassifier(random_state=random_state,oob_score = True,class_weight='balanced_subsample'),X_train,y_train,X_HO,y_HO,param_rf,kf_10,scoring,beta,final_table_tr,final_table_ho)
plt.savefig('plot_rf.png', dpi=150, bbox_inches='tight')
plt.close()
variability_tab = variability_tab.append(tab_rf[tab_rf['params']==p_rf].iloc[:,[31,18,19,8,9,10,11,12,13,14,15,16,17]])
variability_tab = variability_tab.rename({variability_tab.index[4]:'RF'})
#SVM
#print('\nSVM (Linear SVC):\n')
param_svm = {'clf__C':np.arange(0.01,100,10),'clf__tol':[1e-05]}
tab_lsvm,p_svm = comparePredictionMethod(plt,'SVM',LinearSVC(random_state=random_state,class_weight='balanced'),X_train,y_train,X_HO,y_HO,param_svm,kf_10,scoring,beta,final_table_tr,final_table_ho)
plt.savefig('plot_svc.png', dpi=150, bbox_inches='tight')
plt.close()
variability_tab = variability_tab.append(tab_lsvm[tab_lsvm['params']==p_svm].iloc[:,[30,17,18,7,8,9,10,11,12,13,14,15,16]])
variability_tab = variability_tab.rename({variability_tab.index[5]:'LinearSVC'})
#### Comparing different SVM Methods
#describe the SVM models to compare
model_svm_linear=SVC(kernel = 'linear', probability=True,class_weight='balanced')
model_svm_rbf=SVC(kernel = 'rbf', probability=True,class_weight='balanced')
model_svm_poly=SVC(kernel = 'poly', probability=True,class_weight='balanced')
#describe parameters for gridsearch for each model
C = np.arange(0.01,100,10)
param_svm_linear = {'clf__C':C}
param_svm_rbf = {'clf__C':C,'clf__gamma': [0.01, 0.001,'scale']}
param_svm_poly = {'clf__C':C,'clf__degree':[2,3]}
#Linear
#print('SVM Linear:\n')
#call the method to predict and store metrics etc and rename the index to svm_linear
tab_svm1,p_svm1 = comparePredictionMethod(plt,'SVM Other',model_svm_linear,X_train,y_train,X_HO,y_HO,param_svm_linear,kf_10,scoring,beta,svm_tab_tr,svm_tab_ho)
plt.savefig('plot_svml.png', dpi=150, bbox_inches='tight')
plt.close()
#append results to variability table
variability_tab_svm = variability_tab_svm.append(tab_svm1[tab_svm1['params']==p_svm1].iloc[:,[29,16,17,6,7,8,9,10,11,12,13,14,15]])
variability_tab_svm = variability_tab_svm.rename({variability_tab_svm.index[0]:'SVM_'+str(model_svm_linear.kernel)})
#RBF
#print('\nSVM RBF:\n')
#call the method to predict and store metrics etc and rename the index to svm_rbf
tab_svm2,p_svm2 = comparePredictionMethod(plt,'SVM Other',model_svm_rbf,X_train,y_train,X_HO,y_HO,param_svm_rbf,kf_10,scoring,beta,svm_tab_tr,svm_tab_ho)
plt.savefig('plot_svmr.png', dpi=150, bbox_inches='tight')
plt.close()
#append results to variability table
variability_tab_svm = variability_tab_svm.append(tab_svm2[tab_svm2['params']==p_svm2].iloc[:,[30,17,18,7,8,9,10,11,12,13,14,15,16]])
variability_tab_svm = variability_tab_svm.rename({variability_tab_svm.index[1]:'SVM_rbf'})
#Poly
#print('\nSVM Poly:\n')
#call the method to predict and store metrics etc and rename the index to svm_poly
tab_svm3, p_svm3 = comparePredictionMethod(plt,'SVM Other',model_svm_poly,X_train,y_train,X_HO,y_HO,param_svm_poly,kf_10,scoring,beta,svm_tab_tr,svm_tab_ho)
plt.savefig('plot_svmp.png', dpi=150, bbox_inches='tight')
plt.close()
#append results to variability table
variability_tab_svm = variability_tab_svm.append(tab_svm3[tab_svm3['params']==p_svm3].iloc[:,[30,17,18,7,8,9,10,11,12,13,14,15,16]])
variability_tab_svm = variability_tab_svm.rename({variability_tab_svm.index[2]:'SVM_poly'})
#### Visualizing Decision Boundaries
#### Comparing different methods
#https://www.geeksforgeeks.org/how-to-randomly-select-rows-from-pandas-dataframe/
x_tr_sample = preprocessing.scale(np.array(X_train.iloc[:,1:3]))
y_tr_sample= np.array(y_train)
lr = LogisticRegression(random_state=random_state)
lda1 = LinearDiscriminantAnalysis()
qda1 = QuadraticDiscriminantAnalysis()
knn1 = KNeighborsClassifier(n_neighbors=5)
rf1 = RandomForestClassifier(random_state=random_state,min_samples_leaf=1, n_estimators=50, max_depth=15,oob_score=True,class_weight='balanced_subsample')
svm1 = LinearSVC(random_state=random_state,C=70.01,tol=1e-05)
#http://rasbt.github.io/mlxtend/user_guide/plotting/plot_decision_regions/
gs = gridspec.GridSpec(2, 3)
fig = plt.figure(figsize=(12,10))#figsize=(10,10)
labels = ['Logistic Regression', 'LDA','QDA', 'KNN', 'RF', 'LinearSVC']
for clf, lab, grd in zip([lr,lda1,qda1, knn1, rf1, svm1],labels,itertools.product([0, 1, 2], repeat=2)):
clf.fit(x_tr_sample, y_tr_sample)
ax = plt.subplot(gs[grd[0], grd[1]])
fig = plot_decision_regions(X=x_tr_sample, y=np.array(y_tr_sample), clf=clf, legend=2,colors='orange,lightblue')
plt.title(lab)
plt.savefig('plot_methods.png', dpi=150, bbox_inches='tight')
plt.close()
#### Visualizing Decision Boundaries
#### Comparing different SVM methods
#### Note: LinearSVC doesn't expose support vectors like the other SVC kernels do
s_l = SVC(kernel='linear',C=40.01,random_state=random_state)
s_r = SVC(kernel='rbf',C=80.01,random_state=random_state)
s_p = SVC(kernel='poly',C=10.01,degree=3,random_state=random_state)
l_s = LinearSVC(random_state=random_state,C=70.01,tol=1e-05)
#http://rasbt.github.io/mlxtend/user_guide/plotting/plot_decision_regions/
#https://scikit-learn.org/stable/auto_examples/svm/plot_separating_hyperplane_unbalanced.html#sphx-glr-auto-examples-svm-plot-separating-hyperplane-unbalanced-py
gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(10,10))#figsize=(10,10)
labels = ['SVC_Linear', 'SVC_RBF','SVC_Poly', 'LinearSVC']
for clf, lab, grd in zip([s_l,s_r,s_p, l_s],labels,itertools.product([0, 1], repeat=2)):
clf.fit(x_tr_sample, y_tr_sample)
ax = plt.subplot(gs[grd[0], grd[1]])
if(clf==l_s):
fig = plot_decision_regions(X=x_tr_sample, y=y_tr_sample, clf=clf, legend=2,colors='orange,lightblue',zoom_factor=1)
else:
fig = plot_decision_regions(X=x_tr_sample, y=y_tr_sample, clf=clf, legend=2,colors='orange,lightblue',X_highlight=clf.support_vectors_,zoom_factor=1)
plt.title(lab)
plt.savefig('plot_svm.png', dpi=150, bbox_inches='tight')
plt.close()
#https://stackoverflow.com/questions/11854847/how-can-i-display-an-image-from-a-file-in-jupyter-notebook
plotVariability(variability_tab.iloc[:,3:13].T);
plt.savefig('plot_var.png', dpi=150, bbox_inches='tight')
plt.close()
plotVariability(variability_tab_svm.iloc[:,3:13].T);
plt.savefig('plot_svar.png', dpi=150, bbox_inches='tight')
plt.close()
#populate final table for all methods based on the training data
final_table_tr = final_table_tr.drop(labels='Run-time',axis=0)
final_table_tr
LDA | QDA | Knn=5 | Logistic Regression | RF (tuning parameters = {'clf__max_depth': 15, 'clf__min_samples_leaf': 5, 'clf__n_estimators': 50}) | Linear SVC (tuning parameters = {'clf__C': 70.01, 'clf__tol': 1e-05}) | |
---|---|---|---|---|---|---|
Metrics | ||||||
Accuracy | 0.982 | 0.987 | 0.992 | 0.990 | 0.998 | 0.991 |
Balanced-Accuracy | 0.921 | 0.974 | 0.996 | 0.983 | 0.992 | 0.981 |
AUC | 0.859 | 0.966 | 0.996 | 0.969 | 0.995 | 0.972 |
Threshold | 0.066 | 0.038 | 0.200 | 0.642 | 0.713 | 0.084 |
Recall | 0.857 | 0.960 | 1.000 | 0.976 | 0.986 | 0.971 |
Specificity | 0.986 | 0.988 | 0.992 | 0.990 | 0.998 | 0.991 |
FDR | 0.332 | 0.277 | 0.195 | 0.233 | 0.054 | 0.213 |
Precision | 0.668 | 0.723 | 0.805 | 0.767 | 0.946 | 0.787 |
log-loss | 0.069 | 0.017 | 0.005 | 0.054 | 0.014 | 0.015 |
Cost | 4,500,800.000 | 4,575,200.000 | 4,436,800.000 | 4,486,400.000 | 4,112,000.000 | 4,422,400.000 |
#Training
svm_tab_tr = svm_tab_tr.drop(labels='Run-time',axis=0)
svm_tab_tr
SVM_linear (tuning parameters = {'clf__C': 40.01}) | SVM_rbf (tuning parameters = {'clf__C': 80.01, 'clf__gamma': 'scale'}) | SVM_poly (tuning parameters = {'clf__C': 70.01, 'clf__degree': 3}) | |
---|---|---|---|
Metrics | |||
Accuracy | 0.989 | 0.992 | 0.992 |
Balanced-Accuracy | 0.984 | 0.991 | 0.980 |
AUC | 0.966 | 0.989 | 0.985 |
Threshold | 0.053 | 0.201 | 0.114 |
Recall | 0.978 | 0.990 | 0.966 |
Specificity | 0.989 | 0.992 | 0.993 |
FDR | 0.246 | 0.196 | 0.186 |
Precision | 0.754 | 0.804 | 0.814 |
log-loss | 0.018 | 0.009 | 0.016 |
Cost | 4,524,800.000 | 4,417,600.000 | 4,346,400.000 |
#populate final table for all methods based on the hold out data
final_table_ho = final_table_ho.drop(labels='Cost',axis=0)
final_table_ho
LDA | QDA | Knn=5 | Logistic Regression | RF (tuning parameters = {'clf__max_depth': 15, 'clf__min_samples_leaf': 5, 'clf__n_estimators': 50}) | Linear SVC (tuning parameters = {'clf__C': 70.01, 'clf__tol': 1e-05}) | |
---|---|---|---|---|---|---|
Metrics | ||||||
Accuracy | 0.977 | 0.975 | 0.977 | 0.950 | 0.994 | 0.946 |
Balanced-Accuracy | 0.946 | 0.931 | 0.953 | 0.975 | 0.883 | 0.973 |
AUC | 0.683 | 0.762 | 0.689 | 0.970 | 0.672 | 0.971 |
Threshold | 0.066 | 0.038 | 0.200 | 0.642 | 0.713 | 0.084 |
Recall | 0.915 | 0.887 | 0.929 | 1.000 | 0.771 | 1.000 |
Specificity | 0.977 | 0.975 | 0.977 | 0.950 | 0.996 | 0.946 |
FDR | 0.773 | 0.793 | 0.769 | 0.873 | 0.424 | 0.881 |
Precision | 0.227 | 0.207 | 0.231 | 0.127 | 0.576 | 0.119 |
log-loss | 0.074 | 0.016 | 0.120 | 0.173 | 0.040 | 0.026 |
Run-time | 1.100 | 2.700 | 6.300 | 0.600 | 3.800 | 0.900 |
#Hold Out
svm_tab_ho = svm_tab_ho.drop(labels='Cost',axis=0)
svm_tab_ho
SVM_linear (tuning parameters = {'clf__C': 40.01}) | SVM_rbf (tuning parameters = {'clf__C': 80.01, 'clf__gamma': 'scale'}) | SVM_poly (tuning parameters = {'clf__C': 70.01, 'clf__degree': 3}) | |
---|---|---|---|
Metrics | |||
Accuracy | 0.957 | 0.977 | 0.981 |
Balanced-Accuracy | 0.978 | 0.820 | 0.990 |
AUC | 0.968 | 0.512 | 0.960 |
Threshold | 0.053 | 0.201 | 0.114 |
Recall | 1.000 | 0.660 | 1.000 |
Specificity | 0.957 | 0.979 | 0.981 |
FDR | 0.855 | 0.811 | 0.722 |
Precision | 0.145 | 0.189 | 0.278 |
log-loss | 0.020 | 0.032 | 0.039 |
Run-time | 10.600 | 49.900 | 61.600 |
#https://seaborn.pydata.org/generated/seaborn.heatmap.html
final_table_tr.rename(columns={'Logistic Regression':"Logistic",'RF (tuning parameters = '+str(p_rf)+')':"RF",'Linear SVC (tuning parameters = '+str(p_svm)+')':"SVM"},inplace=True)
final_table_ho.rename(columns={'Logistic Regression':"Logistic",'RF (tuning parameters = '+str(p_rf)+')':"RF",'Linear SVC (tuning parameters = '+str(p_svm)+')':"SVM"},inplace=True)
f=plt.figure(figsize=(15,6))
spec = gridspec.GridSpec(ncols=2, nrows=1, figure=f)
f_ax1 = f.add_subplot(spec[0,0])
f_ax2 = f.add_subplot(spec[0,1])
plt.suptitle('Metrics Training vs Hold Out')
sns.heatmap(final_table_tr.iloc[[0,1,2,4,5,6,7,8],:],cmap=cmap,annot=True,ax=f_ax1,square=True,vmin = 0, vmax = 1,**kwargs);
f_ax1.set_yticklabels(final_table_tr.iloc[[0,1,2,4,5,6,7,8],:].index,rotation=0);
f_ax1.xaxis.tick_top();
f_ax1.set_ylabel(' ');
sns.heatmap(final_table_ho.iloc[[0,1,2,4,5,6,7,8],:],cmap=cmap,annot=True,ax=f_ax2,square=True,vmin = 0, vmax = 1,**kwargs);
f_ax2.xaxis.tick_top();
f_ax2.set_ylabel(' ');
f_ax2.set_yticklabels(final_table_tr.iloc[[0,1,2,4,5,6,7,8],:].index,rotation=0);
plt.savefig('plot_mheat.png')
plt.close()
svm_tab_tr.rename(columns={'SVM_'+str(model_svm_linear.kernel)+' (tuning parameters = '+str(p_svm1)+')':'Linear','SVM_'+str(model_svm_rbf.kernel)+' (tuning parameters = '+str(p_svm2)+')':'RBF','SVM_'+str(model_svm_poly.kernel)+' (tuning parameters = '+str(p_svm3)+')':'Poly'},inplace=True)
svm_tab_ho.rename(columns={'SVM_linear (tuning parameters = '+str(p_svm1)+')':'Linear','SVM_rbf (tuning parameters = '+str(p_svm2)+')':'RBF','SVM_poly (tuning parameters = '+str(p_svm3)+')':'Poly'},inplace=True)
f,(ax1,ax2)=plt.subplots(1,2,figsize=(10,6))#figsize=(10,6)
plt.suptitle('Metrics Training vs Hold Out (SVM Methods)')
ax1= sns.heatmap(svm_tab_tr.iloc[[0,1,2,4,5,6,7,8],:],cmap=cmap,annot=True,ax=ax1,square=True,**kwargs);
ax1.set_yticklabels(svm_tab_tr.iloc[[0,1,2,4,5,6,7,8],:].index,rotation=0);
ax1.xaxis.tick_top();
ax1.set_ylabel(' ');
ax2= sns.heatmap(svm_tab_ho.iloc[[0,1,2,4,5,6,7,8],:],cmap=cmap,annot=True,ax=ax2,square=True,**kwargs);
ax2.xaxis.tick_top();
ax2.set_ylabel(' ');
ax2.set_yticklabels(svm_tab_tr.iloc[[0,1,2,4,5,6,7,8],:].index,rotation=0);
plt.savefig('plot_smheat.png')
plt.close()
#https://stackoverflow.com/questions/22483588/how-can-i-plot-separate-pandas-dataframes-as-subplots
fig, axes = plt.subplots(nrows=1, ncols=3,figsize=(18,5))#figsize=(20,5)
ax1 = final_table_ho.iloc[8,:].plot(rot=45,ax=axes[0],marker='o',title='Log Loss on Hold Out Data',color='blue',alpha=0.3)
ax1.set_xlabel("Methods");
ax1.set_ylabel("Log Loss");
ax2 = final_table_tr.iloc[9,:].plot(rot=45,ax=axes[1],marker='o',title='Cost on Test Data', color='green',alpha=0.3)
ax2.set_xlabel("Methods");
ax2.set_ylabel("Cost($)");
ax3 = final_table_ho.iloc[9,:].plot(rot=45,ax=axes[2],marker='o', title='Runtime(Fitting on training and testing on hold out)',color='coral',alpha=0.3)
ax3.set_xlabel("Methods");
ax3.set_ylabel("Runtime(mins)");
plt.savefig('plot_m.png')
plt.close()
#https://stackoverflow.com/questions/22483588/how-can-i-plot-separate-pandas-dataframes-as-subplots
fig, axes = plt.subplots(nrows=1, ncols=3,figsize=(18,5))#, figsize=(20,5)
ax1 = svm_tab_ho.iloc[8,:].plot(rot=45,ax=axes[0],marker='o',title='Log Loss on Hold Out Data', alpha=0.3, color='Blue')
ax1.set_xlabel("SVM Methods");
ax1.set_ylabel("Log Loss");
ax2 = svm_tab_tr.iloc[9,:].plot(rot=45,ax=axes[1],marker='o',title='Cost on Test Data', alpha=0.3,color='Green')
ax2.set_xlabel("SVM Methods");
ax2.set_ylabel("Cost($)");
ax3 = svm_tab_ho.iloc[9,:].plot(rot=45,ax=axes[2],marker='o', title='Runtime(Fitting on training and testing on Hold Out)',color='coral',alpha=0.3,)
ax3.set_xlabel("SVM Methods");
ax3.set_ylabel("Runtime(mins)");
plt.savefig('plot_sm.png')
plt.close()
#References
#https://scikit-learn.org/stable/about.html#citing-scikit-learn
#https://www.kaggle.com/sociopath00/random-forest-using-gridsearchcv
#https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
#https://towardsdatascience.com/optimizing-hyperparameters-in-random-forest-classification-ec7741f9d3f6
#https://medium.com/@lily_su/random-forests-with-decision-trees-bagging-and-gradient-boosting-with-sklearn-and-xgboost-4e0f057dc7b3
#https://www.analyticsvidhya.com/blog/2020/03/beginners-guide-random-forest-hyperparameter-tuning/
#https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#sphx-glr-auto-examples-classification-plot-classifier-comparison-py
#https://bradleyboehmke.github.io/HOML/random-forest.html
#https://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/
#https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py
#https://www.dummies.com/programming/big-data/data-science/data-science-how-to-set-up-a-support-vector-machine-predictive-model-in-python/
#https://dev-aux.com/python/how-to-predict_proba-with-linearsvc
#https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
#https://stats.stackexchange.com/questions/263393/scikit-correct-way-to-calibrate-classifiers-with-calibratedclassifiercv
#https://towardsdatascience.com/imbalanced-class-sizes-and-classification-models-a-cautionary-tale-part-2-cf371500d1b3
#https://towardsdatascience.com/imbalanced-class-sizes-and-classification-models-a-cautionary-tale-3648b8586e03
#https://imbalanced-learn.readthedocs.io/en/stable/combine.html
#https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
#https://machinelearningmastery.com/imbalanced-classification-with-python-7-day-mini-course/
#https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html#sphx-glr-auto-examples-model-selection-plot-learning-curve-py <br>
#https://www.kdnuggets.com/2017/09/visualizing-cross-validation-code.html
#https://datascienceplus.com/how-to-perform-logistic-regression-lda-qda-in-r/#:~:text=LDA%20(Linear%20Discriminant%20Analysis)%20is,for%20all%20class%20is%20normal.
#https://stats.stackexchange.com/questions/276067/whats-considered-a-good-log-loss
#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html