Code samples - How to Experiment With Knobs
In [1]:
#!pip install plotly
In [85]:
import pandas as pd
import pyreadstat
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import plotly.graph_objects as go
import plotly.subplots as psp
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'notebook'
from plotly.offline import init_notebook_mode, iplot, plot
from plotly.subplots import make_subplots
init_notebook_mode()
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
## KNN models
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.calibration import CalibrationDisplay
#from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.utils import resample
In [86]:
#!pip install pyreadstat
#!pip install sklearn
print ("vsersion = {}.''")
vsersion = {}.'' Resampling Notebook¶Read Data files¶In [87]:
sdf, smeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2020_screener_SAS/nsch_2020_screener.sas7bdat')
tdf20, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2020_topical_SAS/nsch_2020_topical.sas7bdat')
tdf19, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2019_topical_SAS/nsch_2019_topical.sas7bdat')
tdf18, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2018_topical_SAS/nsch_2018_topical.sas7bdat')
tdf17, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2017_topical_SAS/nsch_2017_topical.sas7bdat')
#tdf16, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2016_topical_SAS/nsch_2016_topical.sas7bdat')
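The metadata objects that pyreadstat returns alongside each frame carry the survey's variable labels, which helps decode NSCH column names. A minimal sketch, assuming the reads above succeeded:
print(tmeta.column_names_to_labels.get('K2Q31A'))  # label for the ADHD diagnosis question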
In [88]:
tdf20['year']=2020
tdf19['year']=2019
tdf18['year']=2018
tdf17['year']=2017
# Align the earlier years' injury variable (K2Q46A) with the 2020 column name
tdf19['CONFIRMINJURY'] = tdf19['K2Q46A']
tdf18['CONFIRMINJURY'] = tdf18['K2Q46A']
tdf17['CONFIRMINJURY'] = tdf17['K2Q46A']
combined_17_18 = pd.concat([tdf18,tdf17])
combined_17_19 = pd.concat([tdf19,tdf18,tdf17])
combined_all = pd.concat([tdf20,tdf19,tdf18,tdf17])
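A quick sanity check on the year tags added above (a small sketch) shows how many rows each survey year contributes to the combined frame:
combined_all['year'].value_counts().sort_index()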
In [89]:
print (tdf17.shape, tdf18.shape, tdf19.shape, tdf20.shape)
(21599, 433) (30530, 444) (29433, 445) (42777, 444)
In [90]:
combined_17_19.head()
Out[90]:
5 rows × 461 columns
In [8]:
combined_all.head()
Out[8]:
5 rows × 470 columns
In [9]:
# Pre built CSV files
#input_file = 'NSCH_dataset/work/adhd-17-20-downsample-remove-missing-sc.csv'
#input_file = 'NSCH_dataset/work/adhd-17-20-remove-missing-sc.csv'
#input_file = 'NSCH_dataset/work/adhd-20-all-sc.csv'
Feature Columns
In [10]:
feature_cols = ['Childs_age'
,'Mothers_age'
,'Family_structure'
,'Race'
,'Mothers_education'
,'Sex'
,'Premature'
,'Low_Birth_Weight'
,'Very_Low_Birth_Weight'
,'Insurance'
,'Headaches'
,'Depression'
,'Asthma'
,'Arthritis'
,'Anxiety'
,'Allergies'
,'Alcohol']
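Before training, it is worth confirming that every listed feature column was actually created. A small sketch, to run after the data-prep cells below:
missing = [c for c in feature_cols if c not in df_train.columns]
assert not missing, f"missing feature columns: {missing}"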
In [11]:
def read_adhd_data():
    #sdf, smeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2020_screener_SAS/nsch_2020_screener.sas7bdat')
    # requires `input_file` to be defined (see the pre-built CSV paths above)
    tdf = pd.read_csv(input_file)
    return tdf
def feature_subset_new(tdf):
    # K2Q31A: 1 = ADHD ever diagnosed, 2 = not diagnosed; map to a 1/0 target
    tdf['target'] = 2 - tdf['K2Q31A']
    ## check this
    # BIRTH_YR does not work in 17-19-downsample
    #tdf['Childs_age'] = tdf['year'] - tdf['BIRTH_YR'].fillna(-1).astype(int)
    #tdf['Childs_age'] = 2020 - tdf['BIRTH_YR'].fillna(-1).astype(int)
    ### Remove children under 4 years (v8); this filter was removed before Oct 25
    #tdf = tdf[tdf['Childs_age'] > 4]
    # Demographics (missing values coded as -1)
    tdf['Childs_age'] = tdf['SC_AGE_YEARS'].fillna(-1).astype(int)
    tdf['Mothers_age'] = tdf['MOMAGE'].fillna(-1).astype(int)
    tdf['Family_structure'] = tdf['FAMILY_R'].fillna(-1).astype(int)
    tdf['Race'] = tdf['SC_RACER'].fillna(-1).astype(int)
    tdf['Mothers_education'] = tdf['HIGRADE'].fillna(-1).astype(int)
    tdf['Sex'] = tdf['SC_SEX'].fillna(-1).astype(int)
    tdf['Premature'] = tdf['K2Q05'].fillna(-1).astype(int)
    tdf['Low_Birth_Weight'] = tdf['BIRTHWT_L'].fillna(-1).astype(int)
    tdf['Very_Low_Birth_Weight'] = tdf['BIRTHWT_VL'].fillna(-1).astype(int)
    tdf['Insurance'] = tdf['HCCOVOTH'].fillna(-1).astype(int)
    ## Health questions (missing values filled with 2, i.e. "no")
    tdf['Headaches'] = tdf['HEADACHE'].fillna(2).astype(int)
    tdf['Depression'] = tdf['K2Q32A'].fillna(2).astype(int)
    #tdf['Brain_injury'] = tdf['CONFIRMINJURY'].fillna(2).astype(int)
    tdf['Asthma'] = tdf['K2Q40A'].fillna(2).astype(int)
    tdf['Arthritis'] = tdf['ARTHRITIS'].fillna(2).astype(int)
    tdf['Anxiety'] = tdf['K2Q33A'].fillna(2).astype(int)
    tdf['Allergies'] = tdf['ALLERGIES'].fillna(2).astype(int)
    tdf['Alcohol'] = tdf['ACE9'].fillna(2).astype(int)
    #v8 added
    #df_train = pd.DataFrame()
    #df_train = tdf[feature_cols+['target']].dropna()
    #df_train = df_train[~df_train['target'].isnull()].copy()
    return tdf
def feature_clean(df):
    df = df[feature_cols + ['target']].dropna()
    df = df[~df['target'].isnull()].copy()
    return df
def remove_missing(tdf):
    # Note: `|` keeps a row if ANY of these fields is present, so in
    # practice almost no rows are dropped (see the shapes below);
    # combine with `&` instead to drop rows with any missing field.
    tdf = tdf[(tdf['Mothers_age'] >= 0) |
              (tdf['Family_structure'] >= 0) |
              (tdf['Race'] >= 0) |
              (tdf['Mothers_education'] >= 0) |
              (tdf['Sex'] >= 0) |
              (tdf['Premature'] >= 0) |
              (tdf['Low_Birth_Weight'] >= 0) |
              (tdf['Very_Low_Birth_Weight'] >= 0) |
              (tdf['Insurance'] >= 0)]
    return tdf
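# A stricter variant (a sketch; assumption: the intent is to drop rows
# where ANY of the listed fields is missing):
def remove_missing_strict(tdf):
    cols = ['Mothers_age', 'Family_structure', 'Race', 'Mothers_education',
            'Sex', 'Premature', 'Low_Birth_Weight', 'Very_Low_Birth_Weight',
            'Insurance']
    return tdf[(tdf[cols] >= 0).all(axis=1)]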
def downsample(tdf, sub=True):
    # K2Q31A == 1: ADHD cases; K2Q31A == 2: controls
    adhd = tdf[tdf["K2Q31A"] == 1]
    if sub:
        # keep controls at twice the number of cases
        noadhd = resample(tdf[tdf["K2Q31A"] == 2],
                          replace=False,
                          n_samples=2 * len(adhd),
                          random_state=50)
    else:
        noadhd = tdf[tdf["K2Q31A"] == 2]
    df = pd.concat([adhd, noadhd], axis=0)
    return df
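# Example usage (a sketch; run after tdf2 is built below):
#   balanced = downsample(tdf2)
#   balanced['K2Q31A'].value_counts()   # controls should be ~2x the ADHD cases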
def plot_results(pipeline, X, y, X_v1_train, X_v1_test, y_v1_train, y_v1_test, X_v1_valid=None, y_v1_valid=None):
    clf = pipeline.fit(X, y)
    #X_v1_train, X_v1_test, y_v1_train, y_v1_test = train_test_split(X_v1, y_v1, test_size=0.3)
    print("X_v1_train - ", X_v1_train.shape)
    print("X_v1_test - ", X_v1_test.shape)
    print("y_v1_train - ", y_v1_train.shape)
    print("y_v1_test - ", y_v1_test.shape)
    if X_v1_valid is not None:
        print("X_v1_valid - ", X_v1_valid.shape)
        print("y_v1_valid - ", y_v1_valid.shape)
    # Apply the fitted classifier to the train/test (and validation) data
    prediction_train = clf.predict(X_v1_train)
    ac_train = accuracy_score(y_v1_train, prediction_train)
    print("accuracy score on training set - ", ac_train)
    prediction = clf.predict(X_v1_test)
    ac = accuracy_score(y_v1_test, prediction)
    print("accuracy score on test set - ", ac)
    if X_v1_valid is not None:
        prediction_valid = clf.predict(X_v1_valid)
        ac_valid = accuracy_score(y_v1_valid, prediction_valid)
        print("accuracy score on validation set - ", ac_valid)
    labels_names = clf.classes_
    target_names = clf.classes_
    #print(classification_report(y_v1_test, prediction, labels=labels_names, target_names=target_names))
    # labels should be binary (-1,1) or (0,1)
    # roc_curve returns fpr, tpr, thresholds
    fpr_train, tpr_train, t_train = metrics.roc_curve(y_v1_train, pipeline.predict_proba(X_v1_train)[:, 1])
    fpr_test, tpr_test, t_test = metrics.roc_curve(y_v1_test, pipeline.predict_proba(X_v1_test)[:, 1])
    if X_v1_valid is not None:
        fpr_valid, tpr_valid, t_valid = metrics.roc_curve(y_v1_valid, pipeline.predict_proba(X_v1_valid)[:, 1])
    # ROC curve: train vs test
    fig = go.Figure()
    fig.update_layout(legend=dict(yanchor="bottom", y=0, xanchor="right", x=1))
    fig.layout.height = 500
    fig.layout.width = 500
    fig.layout.xaxis.range = [0, 1]
    fig.layout.yaxis.range = [0, 1]
    fig.layout.title = 'ROC Curve'
    fig.layout.xaxis.title = 'FPR'
    fig.layout.yaxis.title = 'TPR (Sensitivity)'
    fig.add_trace(go.Scatter(x=fpr_train, y=tpr_train,
                             name=f'Train AUC:{metrics.auc(fpr_train, tpr_train):,.03f}'))
    fig.add_trace(go.Scatter(x=fpr_test, y=tpr_test,
                             name=f'Test AUC:{metrics.auc(fpr_test, tpr_test):,.03f}'))
    fig.show()
    # ROC curve: train vs validation (only when validation data is given)
    if X_v1_valid is not None:
        fig = go.Figure()
        fig.update_layout(legend=dict(yanchor="bottom", y=0, xanchor="right", x=1))
        fig.layout.height = 500
        fig.layout.width = 500
        fig.layout.xaxis.range = [0, 1]
        fig.layout.yaxis.range = [0, 1]
        fig.layout.title = 'ROC Curve on validation'
        fig.layout.xaxis.title = 'FPR'
        fig.layout.yaxis.title = 'TPR (Sensitivity)'
        fig.add_trace(go.Scatter(x=fpr_train, y=tpr_train,
                                 name=f'Train AUC:{metrics.auc(fpr_train, tpr_train):,.03f}'))
        fig.add_trace(go.Scatter(x=fpr_valid, y=tpr_valid,
                                 name=f'Validation AUC:{metrics.auc(fpr_valid, tpr_valid):,.03f}'))
        fig.show()
    precision_train, recall_train, thresholds_train = metrics.precision_recall_curve(y_v1_train, pipeline.predict_proba(X_v1_train)[:, 1])
    precision_test, recall_test, thresholds_test = metrics.precision_recall_curve(y_v1_test, pipeline.predict_proba(X_v1_test)[:, 1])
    if X_v1_valid is not None:
        precision_valid, recall_valid, thresholds_valid = metrics.precision_recall_curve(y_v1_valid, pipeline.predict_proba(X_v1_valid)[:, 1])
    # PR curve: train vs test
    fig = go.Figure()
    fig.layout.height = 500
    fig.layout.width = 500
    fig.layout.xaxis.range = [0, 1]
    fig.layout.yaxis.range = [0, 1]
    fig.layout.xaxis.title = 'Recall'
    fig.layout.yaxis.title = 'Precision'
    fig.layout.title = 'PR Curve on Test'
    fig.add_trace(go.Scatter(x=recall_train, y=precision_train, name='Train'))
    fig.add_trace(go.Scatter(x=recall_test, y=precision_test, name='Test'))
    fig.show()
    # PR curve: train vs validation (only when validation data is given)
    if X_v1_valid is not None:
        fig = go.Figure()
        fig.layout.height = 500
        fig.layout.width = 500
        fig.layout.xaxis.range = [0, 1]
        fig.layout.yaxis.range = [0, 1]
        fig.layout.xaxis.title = 'Recall'
        fig.layout.yaxis.title = 'Precision'
        fig.layout.title = 'PR Curve on Validation'
        fig.add_trace(go.Scatter(x=recall_train, y=precision_train, name='Train'))
        fig.add_trace(go.Scatter(x=recall_valid, y=precision_valid, name='Valid'))
        fig.show()
    """
    import matplotlib.pyplot as plt
    from sklearn.metrics import plot_confusion_matrix
    color = 'white'
    matrix = plot_confusion_matrix(knn, X_test, y_test, cmap=plt.cm.Blues)
    matrix.ax_.set_title('Confusion Matrix', color=color)
    plt.xlabel('Predicted Label', color=color)
    plt.ylabel('True Label', color=color)
    plt.gcf().axes[0].tick_params(colors=color)
    plt.gcf().axes[1].tick_params(colors=color)
    plt.show()
    """
    # Confusion matrix on test
    cm = metrics.confusion_matrix(y_v1_test, pipeline.predict(X_v1_test))
    disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
    disp.plot()
    plt.show()
    # Confusion matrix on validation (only when validation data is given)
    if X_v1_valid is not None:
        cm_valid = metrics.confusion_matrix(y_v1_valid, pipeline.predict(X_v1_valid))
        disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm_valid, display_labels=clf.classes_)
        disp.plot()
        plt.show()
    """
    print("call 2nd confusion metrics")
    labels = clf.classes_
    print("labels=", labels)
    plot_confusion_metric(y_v1_test, prediction, labels)
    """
In [12]:
train_test_val_option = 1
Option 1 (2017-2020)
In [13]:
# 42777 x 443 from sas7bdat
# 12918 x 444 from csv
# Read from file
#tdf = read_adhd_data()
# Read from memory
if train_test_val_option == 1:
    tdf = combined_all
    output_file_name = 'combined_17_20.csv'
elif train_test_val_option == 2:
    tdf = combined_17_19
    output_file_name = 'combined_17_19.csv'
elif train_test_val_option == 3:
    tdf = combined_17_18
    output_file_name = 'combined_17_18.csv'
Option 2 (2017-2019), 2020
In [14]:
# option 1: 124339 rows, option 2: 81562, option 3: 59963
tdf.shape
Out[14]:
(124339, 470)
In [15]:
tdf.head()
Out[15]:
5 rows × 470 columns
Data Prep and cleaning
In [16]:
df_train = feature_subset_new(tdf)
/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:9: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
[the same PerformanceWarning repeats for each column assignment in feature_subset_new]
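The PerformanceWarning above comes from inserting derived columns one at a time. A hedged alternative (a sketch reusing the same mappings as feature_subset_new, truncated here) builds them in a single assign call, which keeps the frame de-fragmented:
tdf = tdf.assign(
    target=2 - tdf['K2Q31A'],
    Childs_age=tdf['SC_AGE_YEARS'].fillna(-1).astype(int),
    Mothers_age=tdf['MOMAGE'].fillna(-1).astype(int),
    # ... remaining feature mappings exactly as in feature_subset_new ...
)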
In [17]:
df_train.shape
Out[17]:
(124339, 488)
In [18]:
# option 1: 124339 -> 124339 rows; option 2: 81562; option 3: 59963 -> 59963
tdf2 = remove_missing(df_train)
In [19]:
tdf2.shape
Out[19]:
(124339, 488)
In [20]:
# downsample: option 1: 124339 -> 37206 (e.g. 14582 x 488 to 24288 x 461); option 3: 59963 -> 18099
#tdf3 = downsample(tdf2)
#tdf3.shape
#tdf = tdf3
In [21]:
tdf = tdf2
In [22]:
#df_train = feature_subset(tdf)
# 2017-2019
#df_train = feature_subset_new(tdf)
In [23]:
df_train = feature_subset_new(tdf)
In [24]:
df_train.shape
Out[24]:
(124339, 488)
In [25]:
# Per-year feature frames for 2020 and 2019; feature_subset_new mutates the
# input in place, so the reassignments below still see the derived columns
df_train_20 = feature_subset_new(tdf20)
df_train_20 = tdf20[feature_cols+['target']].dropna()
df_train_20 = df_train_20[~df_train_20['target'].isnull()].copy()
df_train_19 = feature_subset_new(tdf19)
df_train_19 = tdf19[feature_cols+['target']].dropna()
df_train_19 = df_train_19[~df_train_19['target'].isnull()].copy()
In [26]:
# original has 19 columns
# adhd-20-all has 18 columns (brain injury removed)
df_train = tdf[feature_cols+['target']].dropna()
df_train = df_train[~df_train['target'].isnull()].copy()
df_train.head(1000)
Out[26]:
1000 rows × 18 columns
In [27]:
# option 2: reduced from 81562 to 80906
df_train.shape
Out[27]:
(123495, 18)
Write file
In [28]:
df_train.to_csv(output_file_name)
df_train_20.to_csv('only_20.csv')
df_train_19.to_csv('only_19.csv')
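Note that to_csv also writes the DataFrame index as an extra unnamed column by default; a small sketch if downstream readers should not see it:
df_train.to_csv(output_file_name, index=False)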
In [29]:
y = df_train['target']
#X = df_train[['Anxiety']]  # needs a 2D array, hence the double brackets
X = df_train[feature_cols]
#X_train, X_test, y_train, y_test = train_test_split(column_trans, y, test_size=0.1)
#clf=RandomForestClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3, class_weight={0:0.10, 1:0.90})
#clf.fit(X_train,y_train)
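plot_results (defined above) expects the fit data plus pre-split train/test sets, with validation optional. A minimal sketch, assuming a random-forest pipeline with settings similar to the commented ones above:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5, random_state=50)
pipeline = make_pipeline(RandomForestClassifier(
    n_estimators=200, min_samples_split=20, min_samples_leaf=10, max_depth=3))
plot_results(pipeline, X_train, y_train, X_train, X_test, y_train, y_test, X_valid, y_valid)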
In [30]:
#!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
In [31]:
import pandas_profiling as pp
pp.ProfileReport(df_train)
[progress bars: Summarize dataset / Generate report structure / Render HTML; the profiling report renders here]
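The pandas-profiling package has since been renamed to ydata-profiling; on newer environments the equivalent import (same ProfileReport API) is:
#!pip install ydata-profiling
from ydata_profiling import ProfileReport
ProfileReport(df_train)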