Code samples - How to Experiment With Knobs
In [1]:
#!pip install plotly
In [85]:
import pandas as pd
import pyreadstat
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import plotly.graph_objects as go
import plotly.subplots as psp
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'notebook'
from plotly.offline import init_notebook_mode, iplot, plot
from plotly.subplots import make_subplots
init_notebook_mode()
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
## KNN models
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.calibration import CalibrationDisplay
#from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.utils import resample
In [86]:
#!pip install pyreadstat
#!pip install sklearn
print ("vsersion = {}.''")
vsersion = {}.'' Resampling Notebook¶Read Data files¶In [87]:
sdf, smeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2020_screener_SAS/nsch_2020_screener.sas7bdat')
tdf20, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2020_topical_SAS/nsch_2020_topical.sas7bdat')
tdf19, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2019_topical_SAS/nsch_2019_topical.sas7bdat')
tdf18, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2018_topical_SAS/nsch_2018_topical.sas7bdat')
tdf17, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2017_topical_SAS/nsch_2017_topical.sas7bdat')
#tdf16, tmeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2016_topical_SAS/nsch_2016_topical.sas7bdat')
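The metadata objects that pyreadstat returns alongside each frame carry the survey's variable labels, which helps decode NSCH column names. A minimal sketch, assuming the reads above succeeded:
print(tmeta.column_names_to_labels.get('K2Q31A'))  # label for the ADHD diagnosis question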
In [88]:
tdf20['year']=2020
tdf19['year']=2019
tdf18['year']=2018
tdf17['year']=2017
# Align the earlier years' injury variable (K2Q46A) with the 2020 column name
tdf19['CONFIRMINJURY'] = tdf19['K2Q46A']
tdf18['CONFIRMINJURY'] = tdf18['K2Q46A']
tdf17['CONFIRMINJURY'] = tdf17['K2Q46A']
combined_17_18 = pd.concat([tdf18,tdf17])
combined_17_19 = pd.concat([tdf19,tdf18,tdf17])
combined_all = pd.concat([tdf20,tdf19,tdf18,tdf17])
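A quick sanity check on the year tags added above (a small sketch) shows how many rows each survey year contributes to the combined frame:
combined_all['year'].value_counts().sort_index()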
In [89]:
print (tdf17.shape, tdf18.shape, tdf19.shape, tdf20.shape)
(21599, 433) (30530, 444) (29433, 445) (42777, 444)
In [90]:
combined_17_19.head()
Out[90]:
5 rows × 461 columns
In [8]:
combined_all.head()
Out[8]:
5 rows × 470 columns
In [9]:
# Pre built CSV files
#input_file = 'NSCH_dataset/work/adhd-17-20-downsample-remove-missing-sc.csv'
#input_file = 'NSCH_dataset/work/adhd-17-20-remove-missing-sc.csv'
#input_file = 'NSCH_dataset/work/adhd-20-all-sc.csv'
Feature Columns
In [10]:
feature_cols = ['Childs_age'
,'Mothers_age'
,'Family_structure'
,'Race'
,'Mothers_education'
,'Sex'
,'Premature'
,'Low_Birth_Weight'
,'Very_Low_Birth_Weight'
,'Insurance'
,'Headaches'
,'Depression'
,'Asthma'
,'Arthritis'
,'Anxiety'
,'Allergies'
,'Alcohol']
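Before training, it is worth confirming that every listed feature column was actually created. A small sketch, to run after the data-prep cells below:
missing = [c for c in feature_cols if c not in df_train.columns]
assert not missing, f"missing feature columns: {missing}"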
In [11]:
def read_adhd_data():
    #sdf, smeta = pyreadstat.read_sas7bdat('NSCH_dataset/nsch_2020_screener_SAS/nsch_2020_screener.sas7bdat')
    # requires `input_file` to be defined (see the pre-built CSV paths above)
    tdf = pd.read_csv(input_file)
    return tdf
def feature_subset_new(tdf):
    # K2Q31A: 1 = ADHD ever diagnosed, 2 = not diagnosed; map to a 1/0 target
    tdf['target'] = 2 - tdf['K2Q31A']
    ## check this
    # BIRTH_YR does not work in 17-19-downsample
    #tdf['Childs_age'] = tdf['year'] - tdf['BIRTH_YR'].fillna(-1).astype(int)
    #tdf['Childs_age'] = 2020 - tdf['BIRTH_YR'].fillna(-1).astype(int)
    ### Remove children under 4 years (v8); this filter was removed before Oct 25
    #tdf = tdf[tdf['Childs_age'] > 4]
    # Demographics (missing values coded as -1)
    tdf['Childs_age'] = tdf['SC_AGE_YEARS'].fillna(-1).astype(int)
    tdf['Mothers_age'] = tdf['MOMAGE'].fillna(-1).astype(int)
    tdf['Family_structure'] = tdf['FAMILY_R'].fillna(-1).astype(int)
    tdf['Race'] = tdf['SC_RACER'].fillna(-1).astype(int)
    tdf['Mothers_education'] = tdf['HIGRADE'].fillna(-1).astype(int)
    tdf['Sex'] = tdf['SC_SEX'].fillna(-1).astype(int)
    tdf['Premature'] = tdf['K2Q05'].fillna(-1).astype(int)
    tdf['Low_Birth_Weight'] = tdf['BIRTHWT_L'].fillna(-1).astype(int)
    tdf['Very_Low_Birth_Weight'] = tdf['BIRTHWT_VL'].fillna(-1).astype(int)
    tdf['Insurance'] = tdf['HCCOVOTH'].fillna(-1).astype(int)
    ## Health questions (missing values filled with 2, i.e. "no")
    tdf['Headaches'] = tdf['HEADACHE'].fillna(2).astype(int)
    tdf['Depression'] = tdf['K2Q32A'].fillna(2).astype(int)
    #tdf['Brain_injury'] = tdf['CONFIRMINJURY'].fillna(2).astype(int)
    tdf['Asthma'] = tdf['K2Q40A'].fillna(2).astype(int)
    tdf['Arthritis'] = tdf['ARTHRITIS'].fillna(2).astype(int)
    tdf['Anxiety'] = tdf['K2Q33A'].fillna(2).astype(int)
    tdf['Allergies'] = tdf['ALLERGIES'].fillna(2).astype(int)
    tdf['Alcohol'] = tdf['ACE9'].fillna(2).astype(int)
    #v8 added
    #df_train = pd.DataFrame()
    #df_train = tdf[feature_cols+['target']].dropna()
    #df_train = df_train[~df_train['target'].isnull()].copy()
    return tdf
def feature_clean(df):
    df = df[feature_cols + ['target']].dropna()
    df = df[~df['target'].isnull()].copy()
    return df
def remove_missing(tdf):
    # Note: `|` keeps a row if ANY of these fields is present, so in
    # practice almost no rows are dropped (see the shapes below);
    # combine with `&` instead to drop rows with any missing field.
    tdf = tdf[(tdf['Mothers_age'] >= 0) |
              (tdf['Family_structure'] >= 0) |
              (tdf['Race'] >= 0) |
              (tdf['Mothers_education'] >= 0) |
              (tdf['Sex'] >= 0) |
              (tdf['Premature'] >= 0) |
              (tdf['Low_Birth_Weight'] >= 0) |
              (tdf['Very_Low_Birth_Weight'] >= 0) |
              (tdf['Insurance'] >= 0)]
    return tdf
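# A stricter variant (a sketch; assumption: the intent is to drop rows
# where ANY of the listed fields is missing):
def remove_missing_strict(tdf):
    cols = ['Mothers_age', 'Family_structure', 'Race', 'Mothers_education',
            'Sex', 'Premature', 'Low_Birth_Weight', 'Very_Low_Birth_Weight',
            'Insurance']
    return tdf[(tdf[cols] >= 0).all(axis=1)]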
def downsample(tdf, sub=True):
    # K2Q31A == 1: ADHD cases; K2Q31A == 2: controls
    adhd = tdf[tdf["K2Q31A"] == 1]
    if sub:
        # keep controls at twice the number of cases
        noadhd = resample(tdf[tdf["K2Q31A"] == 2],
                          replace=False,
                          n_samples=2 * len(adhd),
                          random_state=50)
    else:
        noadhd = tdf[tdf["K2Q31A"] == 2]
    df = pd.concat([adhd, noadhd], axis=0)
    return df
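# Example usage (a sketch; run after tdf2 is built below):
#   balanced = downsample(tdf2)
#   balanced['K2Q31A'].value_counts()   # controls should be ~2x the ADHD cases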
def plot_results(pipeline, X, y, X_v1_train, X_v1_test, y_v1_train, y_v1_test, X_v1_valid=None, y_v1_valid=None):
    clf = pipeline.fit(X, y)
    #X_v1_train, X_v1_test, y_v1_train, y_v1_test = train_test_split(X_v1, y_v1, test_size=0.3)
    print("X_v1_train - ", X_v1_train.shape)
    print("X_v1_test - ", X_v1_test.shape)
    print("y_v1_train - ", y_v1_train.shape)
    print("y_v1_test - ", y_v1_test.shape)
    if X_v1_valid is not None:
        print("X_v1_valid - ", X_v1_valid.shape)
        print("y_v1_valid - ", y_v1_valid.shape)
    # Apply the fitted classifier to the train/test (and validation) data
    prediction_train = clf.predict(X_v1_train)
    ac_train = accuracy_score(y_v1_train, prediction_train)
    print("accuracy score on training set - ", ac_train)
    prediction = clf.predict(X_v1_test)
    ac = accuracy_score(y_v1_test, prediction)
    print("accuracy score on test set - ", ac)
    if X_v1_valid is not None:
        prediction_valid = clf.predict(X_v1_valid)
        ac_valid = accuracy_score(y_v1_valid, prediction_valid)
        print("accuracy score on validation set - ", ac_valid)
    labels_names = clf.classes_
    target_names = clf.classes_
    #print(classification_report(y_v1_test, prediction, labels=labels_names, target_names=target_names))
    # labels should be binary (-1,1) or (0,1)
    # roc_curve returns fpr, tpr, thresholds
    fpr_train, tpr_train, t_train = metrics.roc_curve(y_v1_train, pipeline.predict_proba(X_v1_train)[:, 1])
    fpr_test, tpr_test, t_test = metrics.roc_curve(y_v1_test, pipeline.predict_proba(X_v1_test)[:, 1])
    if X_v1_valid is not None:
        fpr_valid, tpr_valid, t_valid = metrics.roc_curve(y_v1_valid, pipeline.predict_proba(X_v1_valid)[:, 1])
    # ROC curve: train vs test
    fig = go.Figure()
    fig.update_layout(legend=dict(yanchor="bottom", y=0, xanchor="right", x=1))
    fig.layout.height = 500
    fig.layout.width = 500
    fig.layout.xaxis.range = [0, 1]
    fig.layout.yaxis.range = [0, 1]
    fig.layout.title = 'ROC Curve'
    fig.layout.xaxis.title = 'FPR'
    fig.layout.yaxis.title = 'TPR (Sensitivity)'
    fig.add_trace(go.Scatter(x=fpr_train, y=tpr_train,
                             name=f'Train AUC:{metrics.auc(fpr_train, tpr_train):,.03f}'))
    fig.add_trace(go.Scatter(x=fpr_test, y=tpr_test,
                             name=f'Test AUC:{metrics.auc(fpr_test, tpr_test):,.03f}'))
    fig.show()
    # ROC curve: train vs validation (only when validation data is given)
    if X_v1_valid is not None:
        fig = go.Figure()
        fig.update_layout(legend=dict(yanchor="bottom", y=0, xanchor="right", x=1))
        fig.layout.height = 500
        fig.layout.width = 500
        fig.layout.xaxis.range = [0, 1]
        fig.layout.yaxis.range = [0, 1]
        fig.layout.title = 'ROC Curve on validation'
        fig.layout.xaxis.title = 'FPR'
        fig.layout.yaxis.title = 'TPR (Sensitivity)'
        fig.add_trace(go.Scatter(x=fpr_train, y=tpr_train,
                                 name=f'Train AUC:{metrics.auc(fpr_train, tpr_train):,.03f}'))
        fig.add_trace(go.Scatter(x=fpr_valid, y=tpr_valid,
                                 name=f'Validation AUC:{metrics.auc(fpr_valid, tpr_valid):,.03f}'))
        fig.show()
    precision_train, recall_train, thresholds_train = metrics.precision_recall_curve(y_v1_train, pipeline.predict_proba(X_v1_train)[:, 1])
    precision_test, recall_test, thresholds_test = metrics.precision_recall_curve(y_v1_test, pipeline.predict_proba(X_v1_test)[:, 1])
    if X_v1_valid is not None:
        precision_valid, recall_valid, thresholds_valid = metrics.precision_recall_curve(y_v1_valid, pipeline.predict_proba(X_v1_valid)[:, 1])
    # PR curve: train vs test
    fig = go.Figure()
    fig.layout.height = 500
    fig.layout.width = 500
    fig.layout.xaxis.range = [0, 1]
    fig.layout.yaxis.range = [0, 1]
    fig.layout.xaxis.title = 'Recall'
    fig.layout.yaxis.title = 'Precision'
    fig.layout.title = 'PR Curve on Test'
    fig.add_trace(go.Scatter(x=recall_train, y=precision_train, name='Train'))
    fig.add_trace(go.Scatter(x=recall_test, y=precision_test, name='Test'))
    fig.show()
    # PR curve: train vs validation (only when validation data is given)
    if X_v1_valid is not None:
        fig = go.Figure()
        fig.layout.height = 500
        fig.layout.width = 500
        fig.layout.xaxis.range = [0, 1]
        fig.layout.yaxis.range = [0, 1]
        fig.layout.xaxis.title = 'Recall'
        fig.layout.yaxis.title = 'Precision'
        fig.layout.title = 'PR Curve on Validation'
        fig.add_trace(go.Scatter(x=recall_train, y=precision_train, name='Train'))
        fig.add_trace(go.Scatter(x=recall_valid, y=precision_valid, name='Valid'))
        fig.show()
    """
    import matplotlib.pyplot as plt
    from sklearn.metrics import plot_confusion_matrix
    color = 'white'
    matrix = plot_confusion_matrix(knn, X_test, y_test, cmap=plt.cm.Blues)
    matrix.ax_.set_title('Confusion Matrix', color=color)
    plt.xlabel('Predicted Label', color=color)
    plt.ylabel('True Label', color=color)
    plt.gcf().axes[0].tick_params(colors=color)
    plt.gcf().axes[1].tick_params(colors=color)
    plt.show()
    """
    # Confusion matrix on test
    cm = metrics.confusion_matrix(y_v1_test, pipeline.predict(X_v1_test))
    disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
    disp.plot()
    plt.show()
    # Confusion matrix on validation (only when validation data is given)
    if X_v1_valid is not None:
        cm_valid = metrics.confusion_matrix(y_v1_valid, pipeline.predict(X_v1_valid))
        disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm_valid, display_labels=clf.classes_)
        disp.plot()
        plt.show()
    """
    print("call 2nd confusion metrics")
    labels = clf.classes_
    print("labels=", labels)
    plot_confusion_metric(y_v1_test, prediction, labels)
    """
In [12]:
train_test_val_option = 1
Option 1 (2017-2020)
In [13]:
# 42777 x 443 from sas7bdat
# 12918 x 444 from csv
# Read from file
#tdf = read_adhd_data()
# Read from memory
if train_test_val_option == 1:
    tdf = combined_all
    output_file_name = 'combined_17_20.csv'
elif train_test_val_option == 2:
    tdf = combined_17_19
    output_file_name = 'combined_17_19.csv'
elif train_test_val_option == 3:
    tdf = combined_17_18
    output_file_name = 'combined_17_18.csv'
Option 2 (2017-2019), 2020
In [14]:
# option 1: 124339 rows, option 2: 81562, option 3: 59963
tdf.shape
Out[14]:
(124339, 470)
In [15]:
tdf.head()
Out[15]:
5 rows × 470 columns
Data Prep and cleaning
In [16]:
df_train = feature_subset_new(tdf)
/var/folders/rz/hfhqc4_13wb5y67t0275_mmr0000gn/T/ipykernel_3956/2886649691.py:9: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
[the same PerformanceWarning repeats for each column assignment in feature_subset_new]
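The PerformanceWarning above comes from inserting derived columns one at a time. A hedged alternative (a sketch reusing the same mappings as feature_subset_new, truncated here) builds them in a single assign call, which keeps the frame de-fragmented:
tdf = tdf.assign(
    target=2 - tdf['K2Q31A'],
    Childs_age=tdf['SC_AGE_YEARS'].fillna(-1).astype(int),
    Mothers_age=tdf['MOMAGE'].fillna(-1).astype(int),
    # ... remaining feature mappings exactly as in feature_subset_new ...
)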
In [17]:
df_train.shape
Out[17]:
(124339, 488)
In [18]:
# option 1: 124339 -> 124339 rows; option 2: 81562; option 3: 59963 -> 59963
tdf2 = remove_missing(df_train)
In [19]:
tdf2.shape
Out[19]:
(124339, 488)
In [20]:
# downsample: option 1: 124339 -> 37206 (e.g. 14582 x 488 to 24288 x 461); option 3: 59963 -> 18099
#tdf3 = downsample(tdf2)
#tdf3.shape
#tdf = tdf3
In [21]:
tdf = tdf2
In [22]:
#df_train = feature_subset(tdf)
# 2017-2019
#df_train = feature_subset_new(tdf)
In [23]:
df_train = feature_subset_new(tdf)
In [24]:
df_train.shape
Out[24]:
(124339, 488)
In [25]:
# Per-year feature frames for 2020 and 2019; feature_subset_new mutates the
# input in place, so the reassignments below still see the derived columns
df_train_20 = feature_subset_new(tdf20)
df_train_20 = tdf20[feature_cols+['target']].dropna()
df_train_20 = df_train_20[~df_train_20['target'].isnull()].copy()
df_train_19 = feature_subset_new(tdf19)
df_train_19 = tdf19[feature_cols+['target']].dropna()
df_train_19 = df_train_19[~df_train_19['target'].isnull()].copy()
In [26]:
# original has 19 columns
# adhd-20-all has 18 columns (brain injury removed)
df_train = tdf[feature_cols+['target']].dropna()
df_train = df_train[~df_train['target'].isnull()].copy()
df_train.head(1000)
Out[26]:
1000 rows × 18 columns
In [27]:
# option 2: reduced from 81562 to 80906
df_train.shape
Out[27]:
(123495, 18)
Write file
In [28]:
df_train.to_csv(output_file_name)
df_train_20.to_csv('only_20.csv')
df_train_19.to_csv('only_19.csv')
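Note that to_csv also writes the DataFrame index as an extra unnamed column by default; a small sketch if downstream readers should not see it:
df_train.to_csv(output_file_name, index=False)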
In [29]:
y = df_train['target']
#X = df_train[['Anxiety']]  # needs a 2D array, hence the double brackets
X = df_train[feature_cols]
#X_train, X_test, y_train, y_test = train_test_split(column_trans, y, test_size=0.1)
#clf=RandomForestClassifier(n_estimators=200,min_samples_split=20,min_samples_leaf=10,max_depth=3, class_weight={0:0.10, 1:0.90})
#clf.fit(X_train,y_train)
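plot_results (defined above) expects the fit data plus pre-split train/test sets, with validation optional. A minimal sketch, assuming a random-forest pipeline with settings similar to the commented ones above:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5, random_state=50)
pipeline = make_pipeline(RandomForestClassifier(
    n_estimators=200, min_samples_split=20, min_samples_leaf=10, max_depth=3))
plot_results(pipeline, X_train, y_train, X_train, X_test, y_train, y_test, X_valid, y_valid)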
In [30]:
#!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
In [31]:
import pandas_profiling as pp
pp.ProfileReport(df_train)
[progress bars: Summarize dataset / Generate report structure / Render HTML; the profiling report renders here]
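The pandas-profiling package has since been renamed to ydata-profiling; on newer environments the equivalent import (same ProfileReport API) is:
#!pip install ydata-profiling
from ydata_profiling import ProfileReport
ProfileReport(df_train)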