Diabetes prediction using machine learning¶

Importing the necessary packages¶

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

Reading the dataset and getting started with the analysis¶

In [2]:
dataset=pd.read_csv("diabetes.csv")
In [3]:
dataset.head()
Out[3]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
In [4]:
dataset["Age"].max()
Out[4]:
81
In [5]:
dataset.shape
Out[5]:
(768, 9)
In [6]:
dataset.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
In [7]:
dataset.describe()
Out[7]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 0.471876 33.240885 0.348958
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 0.331329 11.760232 0.476951
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078000 21.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 0.243750 24.000000 0.000000
50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 0.372500 29.000000 0.000000
75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 0.626250 41.000000 1.000000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000
In [8]:
dataset.isnull().sum()
Out[8]:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

Observation:¶

1) No object (string) columns¶
2) No null values¶
3) Some columns have a high standard deviation (e.g., Insulin)¶
4) Several features contain implausible zero values (Glucose, BloodPressure, SkinThickness, Insulin, BMI), which effectively act as missing data¶
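Point 4 can be made concrete by counting zeros in the columns where a zero is physiologically impossible (a quick sketch):

# Count rows where each clinical measurement is exactly 0
zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
print((dataset[zero_cols] == 0).sum())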

Converting Age into a categorical variable¶

In [9]:
dataset['Age'].median()
Out[9]:
29.0
In [10]:
dataset.loc[dataset['Age']>=29, 'Age_mod']=0
dataset.loc[dataset['Age']< 29, 'Age_mod']=1
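An equivalent one-liner (a sketch; it produces the same 0/1 split, as integers rather than floats):

dataset['Age_mod'] = (dataset['Age'] < dataset['Age'].median()).astype(int)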
In [11]:
dataset.head()
Out[11]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome Age_mod
0 6 148 72 35 0 33.6 0.627 50 1 0.0
1 1 85 66 29 0 26.6 0.351 31 0 0.0
2 8 183 64 0 0 23.3 0.672 32 1 0.0
3 1 89 66 23 94 28.1 0.167 21 0 1.0
4 0 137 40 35 168 43.1 2.288 33 1 0.0
In [12]:
dataset['Age_mod'].value_counts()
Out[12]:
Age_mod
0.0    401
1.0    367
Name: count, dtype: int64

Correlation¶

In [13]:
dataset.corr()
Out[13]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome Age_mod
Pregnancies 1.000000 0.129459 0.141282 -0.081672 -0.073535 0.017683 -0.033523 0.544341 0.221898 -0.577589
Glucose 0.129459 1.000000 0.152590 0.057328 0.331357 0.221071 0.137337 0.263514 0.466581 -0.246323
BloodPressure 0.141282 0.152590 1.000000 0.207371 0.088933 0.281805 0.041265 0.239528 0.065068 -0.197117
SkinThickness -0.081672 0.057328 0.207371 1.000000 0.436783 0.392573 0.183928 -0.113970 0.074752 0.089299
Insulin -0.073535 0.331357 0.088933 0.436783 1.000000 0.197859 0.185071 -0.042163 0.130548 0.035277
BMI 0.017683 0.221071 0.281805 0.392573 0.197859 1.000000 0.140647 0.036242 0.292695 -0.093753
DiabetesPedigreeFunction -0.033523 0.137337 0.041265 0.183928 0.185071 0.140647 1.000000 0.033561 0.173844 -0.050555
Age 0.544341 0.263514 0.239528 -0.113970 -0.042163 0.036242 0.033561 1.000000 0.238356 -0.756872
Outcome 0.221898 0.466581 0.065068 0.074752 0.130548 0.292695 0.173844 0.238356 1.000000 -0.312100
Age_mod -0.577589 -0.246323 -0.197117 0.089299 0.035277 -0.093753 -0.050555 -0.756872 -0.312100 1.000000
In [14]:
sns.pairplot(dataset)
C:\Users\kiran\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
Out[14]:
<seaborn.axisgrid.PairGrid at 0x21f2d37dcd0>
[Figure: pairplot of all dataset features]

Observation:¶

Most feature pairs are only weakly correlated; the clearest relationships are Age with Pregnancies (0.54), Glucose with Outcome (0.47), and Age with Age_mod (−0.76, expected since Age_mod is derived from Age)¶
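A heatmap makes these relationships easier to scan than the raw matrix (a sketch):

# Annotated correlation heatmap of all numeric columns
plt.figure(figsize=(10, 8))
sns.heatmap(dataset.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.show()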

EDA¶

In [15]:
dataset.head()
Out[15]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome Age_mod
0 6 148 72 35 0 33.6 0.627 50 1 0.0
1 1 85 66 29 0 26.6 0.351 31 0 0.0
2 8 183 64 0 0 23.3 0.672 32 1 0.0
3 1 89 66 23 94 28.1 0.167 21 0 1.0
4 0 137 40 35 168 43.1 2.288 33 1 0.0

Creating a function to draw the bar plots¶

In [16]:
dataset.columns
Out[16]:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Age_mod'],
      dtype='object')

We define a function called graph that draws a bar chart of the desired column against Outcome¶

In [17]:
def graph(feature, color):
    # Bar chart of the chosen feature against the Outcome class.
    # With 768 overlapping bars at x=0 and x=1, the visible height of
    # each bar is effectively the per-class maximum of the feature.
    plt.bar(dataset['Outcome'], dataset[feature], color=color)
    plt.xlabel('Outcome')
    plt.ylabel(feature)
    plt.show()

Pregnancies vs Outcome¶

In [18]:
graph('Pregnancies', 'purple')
[Figure: bar plot of Pregnancies by Outcome]

Observation:¶

The maximum number of pregnancies among people with diabetes is about 17, while those without diabetes top out at about 13¶

Glucose vs Outcome¶

In [19]:
graph('Glucose', 'grey')
[Figure: bar plot of Glucose by Outcome]

Observation:¶

People with diabetes show somewhat higher glucose levels¶
In [20]:
a=dataset.columns

BloodPressure vs Outcome¶

In [21]:
graph('BloodPressure', 'orange')
[Figure: bar plot of BloodPressure by Outcome]

Observation:¶

People with diabetes show a slightly lower maximum blood pressure than those without¶
In [22]:
a
Out[22]:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Age_mod'],
      dtype='object')

SkinThickness vs Outcome¶

In [23]:
graph('SkinThickness', 'brown')
[Figure: bar plot of SkinThickness by Outcome]

Observation:¶

People with diabetes show nearly double the maximum skin thickness of those without¶
In [24]:
a
Out[24]:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Age_mod'],
      dtype='object')

Insulin vs Outcome¶

In [25]:
graph('Insulin', 'green')
[Figure: bar plot of Insulin by Outcome]

Observation:¶

The maximum insulin level among people with diabetes (around 846) exceeds the maximum among those without (around 740)¶
In [26]:
a
Out[26]:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Age_mod'],
      dtype='object')

BMI vs Outcome¶

In [27]:
graph('BMI', 'magenta');
[Figure: bar plot of BMI by Outcome]

Observation:¶

The maximum BMI among people with diabetes (about 67) is somewhat higher than among those without (about 57)¶

DiabetesPedigreeFunction vs Outcome¶

In [28]:
graph('DiabetesPedigreeFunction', 'yellow');
[Figure: bar plot of DiabetesPedigreeFunction by Outcome]

Observation:¶

The Diabetes Pedigree Function reaches a maximum of about 2.4 among people with diabetes versus about 2.3 among those without¶

Age vs Outcome¶

In [29]:
graph('Age', 'magenta');
[Figure: bar plot of Age by Outcome]

Observation:¶

People with diabetes have a maximum age of about 80, while for those without diabetes the maximum is approximately 65¶

Outliers¶

Pregnancies¶

In [30]:
sns.distplot(dataset['Pregnancies'])
C:\Users\kiran\AppData\Local\Temp\ipykernel_25516\951228210.py:1: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(dataset['Pregnancies'])
C:\Users\kiran\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
Out[30]:
<Axes: xlabel='Pregnancies', ylabel='Density'>
[Figure: distribution of Pregnancies]
In [31]:
plt.boxplot(dataset['Pregnancies']);
[Figure: box plot of Pregnancies]
In [32]:
IQR= dataset['Pregnancies'].quantile(0.75)-dataset['Pregnancies'].quantile(0.25)
IQR
Out[32]:
5.0
In [33]:
upper_preg=dataset['Pregnancies'].quantile(0.75)+(IQR*3)
upper_preg
Out[33]:
21.0
In [34]:
dataset['Pregnancies'].nunique()
Out[34]:
17
In [35]:
dataset['Pregnancies'].unique()
Out[35]:
array([ 6,  1,  8,  0,  5,  3, 10,  2,  4,  7,  9, 11, 13, 15, 17, 12, 14],
      dtype=int64)
In [36]:
dataset['Pregnancies'].value_counts()
Out[36]:
Pregnancies
1     135
0     111
2     103
3      75
4      68
5      57
6      50
7      45
8      38
9      28
10     24
11     11
13     10
12      9
14      2
15      1
17      1
Name: count, dtype: int64
In [37]:
50+45+38+28+24+11+10+9+2+2
Out[37]:
219
In [38]:
11+10+9+2+2
Out[38]:
34
In [39]:
768-219
Out[39]:
549
In [40]:
dataset.shape
Out[40]:
(768, 10)
In [41]:
(219*100)/768
Out[41]:
28.515625
In [42]:
(34*100)/768
Out[42]:
4.427083333333333
In [43]:
dataset['Pregnancies_mod']=dataset['Pregnancies']
In [44]:
dataset.loc[dataset['Pregnancies']> 10, 'Pregnancies_mod']=5
In [45]:
dataset['Pregnancies'].max()
Out[45]:
17
In [46]:
dataset['Pregnancies_mod'].max()
Out[46]:
10

Every Pregnancies value greater than 10 was replaced with 5, so the maximum of Pregnancies_mod is now 10¶
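The remaining columns are handled the same way: compute a bound (from the IQR here, or a manually chosen threshold) and clip to it. A reusable sketch of the IQR variant:

def cap_outliers(s, k=3):
    # Clip a Series to [Q1 - k*IQR, Q3 + k*IQR]
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    return s.clip(lower=q1 - k * iqr, upper=q3 + k * iqr)

For example, cap_outliers(dataset['BMI']) would cap BMI at 64.5 on the high side, matching the manual bound used later.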

In [47]:
dataset.head()
Out[47]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome Age_mod Pregnancies_mod
0 6 148 72 35 0 33.6 0.627 50 1 0.0 6
1 1 85 66 29 0 26.6 0.351 31 0 0.0 1
2 8 183 64 0 0 23.3 0.672 32 1 0.0 8
3 1 89 66 23 94 28.1 0.167 21 0 1.0 1
4 0 137 40 35 168 43.1 2.288 33 1 0.0 0

Glucose¶

In [48]:
sns.distplot(dataset['Glucose']);
[Figure: distribution of Glucose]
In [49]:
plt.boxplot(dataset['Glucose']);
[Figure: box plot of Glucose]
In [50]:
IQR= dataset['Glucose'].quantile(0.75)-dataset['Glucose'].quantile(0.25)
IQR
Out[50]:
41.25
In [51]:
lower_glucose=dataset['Glucose'].quantile(0.25)-(IQR*3)
lower_glucose
Out[51]:
-24.75
In [52]:
dataset['Glucose'].min()
Out[52]:
0
In [53]:
dataset['Glucose'].max()
Out[53]:
199
In [54]:
dataset['Glucose_mod']=dataset['Glucose']
In [55]:
dataset.loc[dataset['Glucose']<20, 'Glucose_mod']=20
In [56]:
dataset.head(10)
Out[56]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome Age_mod Pregnancies_mod Glucose_mod
0 6 148 72 35 0 33.6 0.627 50 1 0.0 6 148
1 1 85 66 29 0 26.6 0.351 31 0 0.0 1 85
2 8 183 64 0 0 23.3 0.672 32 1 0.0 8 183
3 1 89 66 23 94 28.1 0.167 21 0 1.0 1 89
4 0 137 40 35 168 43.1 2.288 33 1 0.0 0 137
5 5 116 74 0 0 25.6 0.201 30 0 0.0 5 116
6 3 78 50 32 88 31.0 0.248 26 1 1.0 3 78
7 10 115 0 0 0 35.3 0.134 29 0 0.0 10 115
8 2 197 70 45 543 30.5 0.158 53 1 0.0 2 197
9 8 125 96 0 0 0.0 0.232 54 1 0.0 8 125
In [57]:
dataset['Glucose_mod'].min()
Out[57]:
20

We raised every glucose value below 20 to 20¶

Blood Pressure¶

In [58]:
sns.distplot(dataset['BloodPressure'])
Out[58]:
<Axes: xlabel='BloodPressure', ylabel='Density'>
[Figure: distribution of BloodPressure]
In [59]:
plt.boxplot(dataset['BloodPressure']);
[Figure: box plot of BloodPressure]
In [60]:
dataset['BloodPressure'].nunique()
Out[60]:
47
In [61]:
dataset['BloodPressure'].min()
Out[61]:
0
In [62]:
dataset['BloodPressure'].max()
Out[62]:
122
In [63]:
IQR= dataset['BloodPressure'].quantile(0.75)-dataset['BloodPressure'].quantile(0.25)
IQR
Out[63]:
18.0
In [64]:
lower_bp=dataset['BloodPressure'].quantile(0.25)-(IQR*3)
lower_bp
Out[64]:
8.0
In [65]:
upper_bp=dataset['BloodPressure'].quantile(0.75)+(IQR*3)
upper_bp
Out[65]:
134.0
In [66]:
dataset['BloodPressure_mod']=dataset['BloodPressure']
In [67]:
dataset.loc[dataset['BloodPressure']<40, 'BloodPressure_mod']=40
In [68]:
dataset.head()
Out[68]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome Age_mod Pregnancies_mod Glucose_mod BloodPressure_mod
0 6 148 72 35 0 33.6 0.627 50 1 0.0 6 148 72
1 1 85 66 29 0 26.6 0.351 31 0 0.0 1 85 66
2 8 183 64 0 0 23.3 0.672 32 1 0.0 8 183 64
3 1 89 66 23 94 28.1 0.167 21 0 1.0 1 89 66
4 0 137 40 35 168 43.1 2.288 33 1 0.0 0 137 40
In [69]:
import matplotlib.pyplot as plt
import seaborn as sns

We raised every blood pressure value below 40 to 40¶

Skin Thickness¶

In [70]:
sns.distplot(dataset['SkinThickness']);
[Figure: distribution of SkinThickness]
In [71]:
plt.boxplot(dataset['SkinThickness']);
[Figure: box plot of SkinThickness]
In [72]:
dataset['SkinThickness'].max()
Out[72]:
99
In [73]:
dataset['SkinThickness'].min()
Out[73]:
0
In [74]:
dataset['SkinThickness_mod']=dataset['SkinThickness']
In [75]:
dataset.loc[dataset['SkinThickness']<2, 'SkinThickness_mod']=2
In [76]:
IQ1R= dataset['SkinThickness'].quantile(0.75)-dataset['SkinThickness'].quantile(0.25)
IQ1R
Out[76]:
32.0
In [77]:
upperst=dataset['SkinThickness'].quantile(0.75)+(IQ1R*3)
upperst
Out[77]:
128.0
In [78]:
dataset.head()
Out[78]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome Age_mod Pregnancies_mod Glucose_mod BloodPressure_mod SkinThickness_mod
0 6 148 72 35 0 33.6 0.627 50 1 0.0 6 148 72 35
1 1 85 66 29 0 26.6 0.351 31 0 0.0 1 85 66 29
2 8 183 64 0 0 23.3 0.672 32 1 0.0 8 183 64 2
3 1 89 66 23 94 28.1 0.167 21 0 1.0 1 89 66 23
4 0 137 40 35 168 43.1 2.288 33 1 0.0 0 137 40 35

Insulin¶

In [79]:
sns.distplot(dataset['Insulin']);
[Figure: distribution of Insulin]
In [80]:
plt.boxplot(dataset['Insulin']);
[Figure: box plot of Insulin]
In [81]:
dataset['Insulin'].min()
Out[81]:
0
In [82]:
dataset['Insulin'].max()
Out[82]:
846
In [83]:
dataset['Insulin_mod']=dataset['Insulin']
In [84]:
dataset.loc[dataset['Insulin']<23, 'Insulin_mod']=23
In [85]:
IQ1R= dataset['Insulin'].quantile(0.75)-dataset['Insulin'].quantile(0.25)
IQ1R
Out[85]:
127.25
In [86]:
upper_IQR=dataset['Insulin'].quantile(0.75)+(IQ1R*3)
upper_IQR
Out[86]:
509.0
In [87]:
dataset.loc[dataset['Insulin']>509, 'Insulin_mod']=509
In [88]:
dataset['Insulin_mod'].min()
Out[88]:
23
In [89]:
dataset['Insulin_mod'].max()
Out[89]:
509
In [90]:
dataset['Insulin_mod'].max()
Out[90]:
509
In [91]:
dataset.head(10)
Out[91]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome Age_mod Pregnancies_mod Glucose_mod BloodPressure_mod SkinThickness_mod Insulin_mod
0 6 148 72 35 0 33.6 0.627 50 1 0.0 6 148 72 35 23
1 1 85 66 29 0 26.6 0.351 31 0 0.0 1 85 66 29 23
2 8 183 64 0 0 23.3 0.672 32 1 0.0 8 183 64 2 23
3 1 89 66 23 94 28.1 0.167 21 0 1.0 1 89 66 23 94
4 0 137 40 35 168 43.1 2.288 33 1 0.0 0 137 40 35 168
5 5 116 74 0 0 25.6 0.201 30 0 0.0 5 116 74 2 23
6 3 78 50 32 88 31.0 0.248 26 1 1.0 3 78 50 32 88
7 10 115 0 0 0 35.3 0.134 29 0 0.0 10 115 40 2 23
8 2 197 70 45 543 30.5 0.158 53 1 0.0 2 197 70 45 509
9 8 125 96 0 0 0.0 0.232 54 1 0.0 8 125 96 2 23

BMI¶

In [92]:
sns.distplot(dataset['BMI']);
[Figure: distribution of BMI]

Variance measures the spread of the data around the mean of the column¶

In [93]:
dataset['BMI'].describe()
Out[93]:
count    768.000000
mean      31.992578
std        7.884160
min        0.000000
25%       27.300000
50%       32.000000
75%       36.600000
max       67.100000
Name: BMI, dtype: float64
In [94]:
plt.boxplot(dataset['BMI']);
[Figure: box plot of BMI]
In [95]:
IQ1R= dataset['BMI'].quantile(0.75)-dataset['BMI'].quantile(0.25)
IQ1R
Out[95]:
9.3
In [96]:
upper_BMI=dataset['BMI'].quantile(0.75)+(IQ1R*3)
upper_BMI
Out[96]:
64.5
In [97]:
dataset['BMI_mod']=dataset['BMI']
In [98]:
dataset.loc[dataset['BMI']<15, 'BMI_mod']=15
dataset.loc[dataset['BMI']>64.5, 'BMI_mod']=64.5
In [99]:
print("BMI:")
print(dataset['BMI'].describe())
print('------------------------------------------------------')
print("BMI_mod:")
print(dataset['BMI_mod'].describe())
BMI:
count    768.000000
mean      31.992578
std        7.884160
min        0.000000
25%       27.300000
50%       32.000000
75%       36.600000
max       67.100000
Name: BMI, dtype: float64
------------------------------------------------------
BMI_mod:
count    768.000000
mean      32.204036
std        7.165761
min       15.000000
25%       27.300000
50%       32.000000
75%       36.600000
max       64.500000
Name: BMI_mod, dtype: float64
In [100]:
dataset.head()
Out[100]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome Age_mod Pregnancies_mod Glucose_mod BloodPressure_mod SkinThickness_mod Insulin_mod BMI_mod
0 6 148 72 35 0 33.6 0.627 50 1 0.0 6 148 72 35 23 33.6
1 1 85 66 29 0 26.6 0.351 31 0 0.0 1 85 66 29 23 26.6
2 8 183 64 0 0 23.3 0.672 32 1 0.0 8 183 64 2 23 23.3
3 1 89 66 23 94 28.1 0.167 21 0 1.0 1 89 66 23 94 28.1
4 0 137 40 35 168 43.1 2.288 33 1 0.0 0 137 40 35 168 43.1

DiabetesPedigreeFunction¶

In [101]:
sns.distplot(dataset['DiabetesPedigreeFunction']);
[Figure: distribution of DiabetesPedigreeFunction]
In [102]:
plt.boxplot(dataset['DiabetesPedigreeFunction']);
[Figure: box plot of DiabetesPedigreeFunction]
In [103]:
dataset['DiabetesPedigreeFunction'].describe()
Out[103]:
count    768.000000
mean       0.471876
std        0.331329
min        0.078000
25%        0.243750
50%        0.372500
75%        0.626250
max        2.420000
Name: DiabetesPedigreeFunction, dtype: float64
In [104]:
dataset['DiabetesPedigreeFunction'].nunique()
Out[104]:
517
In [105]:
dataset['DiabetesPedigreeFunction'].unique()
Out[105]:
array([0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.134, 0.158,
       0.232, 0.191, 0.537, 1.441, 0.398, 0.587, 0.484, 0.551, 0.254,
       0.183, 0.529, 0.704, 0.388, 0.451, 0.263, 0.205, 0.257, 0.487,
       0.245, 0.337, 0.546, 0.851, 0.267, 0.188, 0.512, 0.966, 0.42 ,
       0.665, 0.503, 1.39 , 0.271, 0.696, 0.235, 0.721, 0.294, 1.893,
       0.564, 0.586, 0.344, 0.305, 0.491, 0.526, 0.342, 0.467, 0.718,
       0.962, 1.781, 0.173, 0.304, 0.27 , 0.699, 0.258, 0.203, 0.855,
       0.845, 0.334, 0.189, 0.867, 0.411, 0.583, 0.231, 0.396, 0.14 ,
       0.391, 0.37 , 0.307, 0.102, 0.767, 0.237, 0.227, 0.698, 0.178,
       0.324, 0.153, 0.165, 0.443, 0.261, 0.277, 0.761, 0.255, 0.13 ,
       0.323, 0.356, 0.325, 1.222, 0.179, 0.262, 0.283, 0.93 , 0.801,
       0.207, 0.287, 0.336, 0.247, 0.199, 0.543, 0.192, 0.588, 0.539,
       0.22 , 0.654, 0.223, 0.759, 0.26 , 0.404, 0.186, 0.278, 0.496,
       0.452, 0.403, 0.741, 0.361, 1.114, 0.457, 0.647, 0.088, 0.597,
       0.532, 0.703, 0.159, 0.268, 0.286, 0.318, 0.272, 0.572, 0.096,
       1.4  , 0.218, 0.085, 0.399, 0.432, 1.189, 0.687, 0.137, 0.637,
       0.833, 0.229, 0.817, 0.204, 0.368, 0.743, 0.722, 0.256, 0.709,
       0.471, 0.495, 0.18 , 0.542, 0.773, 0.678, 0.719, 0.382, 0.319,
       0.19 , 0.956, 0.084, 0.725, 0.299, 0.244, 0.745, 0.615, 1.321,
       0.64 , 0.142, 0.374, 0.383, 0.578, 0.136, 0.395, 0.187, 0.905,
       0.15 , 0.874, 0.236, 0.787, 0.407, 0.605, 0.151, 0.289, 0.355,
       0.29 , 0.375, 0.164, 0.431, 0.742, 0.514, 0.464, 1.224, 1.072,
       0.805, 0.209, 0.666, 0.101, 0.198, 0.652, 2.329, 0.089, 0.645,
       0.238, 0.394, 0.293, 0.479, 0.686, 0.831, 0.582, 0.446, 0.402,
       1.318, 0.329, 1.213, 0.427, 0.282, 0.143, 0.38 , 0.284, 0.249,
       0.926, 0.557, 0.092, 0.655, 1.353, 0.612, 0.2  , 0.226, 0.997,
       0.933, 1.101, 0.078, 0.24 , 1.136, 0.128, 0.422, 0.251, 0.677,
       0.296, 0.454, 0.744, 0.881, 0.28 , 0.259, 0.619, 0.808, 0.34 ,
       0.434, 0.757, 0.613, 0.692, 0.52 , 0.412, 0.84 , 0.839, 0.156,
       0.215, 0.326, 1.391, 0.875, 0.313, 0.433, 0.626, 1.127, 0.315,
       0.345, 0.129, 0.527, 0.197, 0.731, 0.148, 0.123, 0.127, 0.122,
       1.476, 0.166, 0.932, 0.343, 0.893, 0.331, 0.472, 0.673, 0.389,
       0.485, 0.349, 0.279, 0.346, 0.252, 0.243, 0.58 , 0.559, 0.302,
       0.569, 0.378, 0.385, 0.499, 0.306, 0.234, 2.137, 1.731, 0.545,
       0.225, 0.816, 0.528, 0.509, 1.021, 0.821, 0.947, 1.268, 0.221,
       0.66 , 0.239, 0.949, 0.444, 0.463, 0.803, 1.6  , 0.944, 0.196,
       0.241, 0.161, 0.135, 0.376, 1.191, 0.702, 0.674, 1.076, 0.534,
       1.095, 0.554, 0.624, 0.219, 0.507, 0.561, 0.421, 0.516, 0.264,
       0.328, 0.233, 0.108, 1.138, 0.147, 0.727, 0.435, 0.497, 0.23 ,
       0.955, 2.42 , 0.658, 0.33 , 0.51 , 0.285, 0.415, 0.381, 0.832,
       0.498, 0.212, 0.364, 1.001, 0.46 , 0.733, 0.416, 0.705, 1.022,
       0.269, 0.6  , 0.571, 0.607, 0.17 , 0.21 , 0.126, 0.711, 0.466,
       0.162, 0.419, 0.63 , 0.365, 0.536, 1.159, 0.629, 0.292, 0.145,
       1.144, 0.174, 0.547, 0.163, 0.738, 0.314, 0.968, 0.409, 0.297,
       0.525, 0.154, 0.771, 0.107, 0.493, 0.717, 0.917, 0.501, 1.251,
       0.735, 0.804, 0.661, 0.549, 0.825, 0.423, 1.034, 0.16 , 0.341,
       0.68 , 0.591, 0.3  , 0.121, 0.502, 0.401, 0.601, 0.748, 0.338,
       0.43 , 0.892, 0.813, 0.693, 0.575, 0.371, 0.206, 0.417, 1.154,
       0.925, 0.175, 1.699, 0.682, 0.194, 0.4  , 0.1  , 1.258, 0.482,
       0.138, 0.593, 0.878, 0.157, 1.282, 0.141, 0.246, 1.698, 1.461,
       0.347, 0.362, 0.393, 0.144, 0.732, 0.115, 0.465, 0.649, 0.871,
       0.149, 0.695, 0.303, 0.61 , 0.73 , 0.447, 0.455, 0.133, 0.155,
       1.162, 1.292, 0.182, 1.394, 0.217, 0.631, 0.88 , 0.614, 0.332,
       0.366, 0.181, 0.828, 0.335, 0.856, 0.886, 0.439, 0.253, 0.598,
       0.904, 0.483, 0.565, 0.118, 0.177, 0.176, 0.295, 0.441, 0.352,
       0.826, 0.97 , 0.595, 0.317, 0.265, 0.646, 0.426, 0.56 , 0.515,
       0.453, 0.785, 0.734, 1.174, 0.488, 0.358, 1.096, 0.408, 1.182,
       0.222, 1.057, 0.766, 0.171])
In [106]:
IQ1R= dataset['DiabetesPedigreeFunction'].quantile(0.75)-dataset['DiabetesPedigreeFunction'].quantile(0.25)
IQ1R
Out[106]:
0.38249999999999995
In [107]:
upper_DPF=dataset['DiabetesPedigreeFunction'].quantile(0.75)+(IQ1R*3)
upper_DPF
Out[107]:
1.77375
In [108]:
dataset['DiabetesPedigreeFunction_mod']=dataset['DiabetesPedigreeFunction']
In [109]:
dataset.loc[dataset['DiabetesPedigreeFunction']>1.77375, 'DiabetesPedigreeFunction_mod']=1.77375
In [110]:
dataset.head()
Out[110]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome Age_mod Pregnancies_mod Glucose_mod BloodPressure_mod SkinThickness_mod Insulin_mod BMI_mod DiabetesPedigreeFunction_mod
0 6 148 72 35 0 33.6 0.627 50 1 0.0 6 148 72 35 23 33.6 0.62700
1 1 85 66 29 0 26.6 0.351 31 0 0.0 1 85 66 29 23 26.6 0.35100
2 8 183 64 0 0 23.3 0.672 32 1 0.0 8 183 64 2 23 23.3 0.67200
3 1 89 66 23 94 28.1 0.167 21 0 1.0 1 89 66 23 94 28.1 0.16700
4 0 137 40 35 168 43.1 2.288 33 1 0.0 0 137 40 35 168 43.1 1.77375
In [111]:
dataset.shape
Out[111]:
(768, 17)
In [112]:
dataset.describe()
Out[112]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome Age_mod Pregnancies_mod Glucose_mod BloodPressure_mod SkinThickness_mod Insulin_mod BMI_mod DiabetesPedigreeFunction_mod
count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 0.471876 33.240885 0.348958 0.477865 3.520833 121.024740 70.977865 21.127604 89.735677 32.204036 0.469005
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 0.331329 11.760232 0.476951 0.499835 2.836176 31.516899 13.725500 15.198534 101.188603 7.165761 0.317492
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078000 21.000000 0.000000 0.000000 0.000000 20.000000 40.000000 2.000000 23.000000 15.000000 0.078000
25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 0.243750 24.000000 0.000000 0.000000 1.000000 99.000000 62.000000 2.000000 23.000000 27.300000 0.243750
50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 0.372500 29.000000 0.000000 0.000000 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 0.372500
75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 0.626250 41.000000 1.000000 1.000000 5.000000 140.250000 80.000000 32.000000 127.250000 36.600000 0.626250
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000 1.000000 10.000000 199.000000 122.000000 99.000000 509.000000 64.500000 1.773750
In [113]:
dataset.columns
Out[113]:
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Age_mod',
       'Pregnancies_mod', 'Glucose_mod', 'BloodPressure_mod',
       'SkinThickness_mod', 'Insulin_mod', 'BMI_mod',
       'DiabetesPedigreeFunction_mod'],
      dtype='object')
In [114]:
col=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']
for i in col:
    dataset.drop([i], axis=1, inplace=True)
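The loop drops the original columns one at a time; an equivalent single call (a sketch):

# Drop all original (un-capped) columns in one call
dataset.drop(columns=col, inplace=True)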
In [115]:
dataset.head()
Out[115]:
Outcome Age_mod Pregnancies_mod Glucose_mod BloodPressure_mod SkinThickness_mod Insulin_mod BMI_mod DiabetesPedigreeFunction_mod
0 1 0.0 6 148 72 35 23 33.6 0.62700
1 0 0.0 1 85 66 29 23 26.6 0.35100
2 1 0.0 8 183 64 2 23 23.3 0.67200
3 0 1.0 1 89 66 23 94 28.1 0.16700
4 1 0.0 0 137 40 35 168 43.1 1.77375
In [116]:
col=dataset.columns
In [117]:
for i in col:
    print(i,'--->', dataset[i].min())
    
    print(i,'--->', dataset[i].max())
    print("---------------------------------------------")
Outcome ---> 0
Outcome ---> 1
---------------------------------------------
Age_mod ---> 0.0
Age_mod ---> 1.0
---------------------------------------------
Pregnancies_mod ---> 0
Pregnancies_mod ---> 10
---------------------------------------------
Glucose_mod ---> 20
Glucose_mod ---> 199
---------------------------------------------
BloodPressure_mod ---> 40
BloodPressure_mod ---> 122
---------------------------------------------
SkinThickness_mod ---> 2
SkinThickness_mod ---> 99
---------------------------------------------
Insulin_mod ---> 23
Insulin_mod ---> 509
---------------------------------------------
BMI_mod ---> 15.0
BMI_mod ---> 64.5
---------------------------------------------
DiabetesPedigreeFunction_mod ---> 0.078
DiabetesPedigreeFunction_mod ---> 1.77375
---------------------------------------------

Model Building¶

In [118]:
X=dataset.drop('Outcome', axis=1)
Y=dataset['Outcome']
In [119]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test =train_test_split(X,Y,test_size=0.2,random_state=0)
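Since only about 35% of the rows are positive, a stratified split (an alternative to the split above, not what the notebook ran) would keep that class ratio identical in train and test:

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0, stratify=Y)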

Logistic Regression¶

In [121]:
lr=LogisticRegression()
In [122]:
%%time
lr.fit(X_train,Y_train)
CPU times: total: 31.2 ms
Wall time: 102 ms
C:\Users\kiran\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Out[122]:
LogisticRegression()
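The ConvergenceWarning above suggests increasing max_iter or scaling the data; a sketch of the scaled variant (not what the notebook ran):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardizing the features typically lets lbfgs converge within its iteration budget
lr_scaled = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
lr_scaled.fit(X_train, Y_train)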
In [123]:
pred_lr= lr.predict(X_test)
In [124]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
In [125]:
accuracy_lr=accuracy_score(Y_test, pred_lr)
print("Accuracy: %.2f%%" % (accuracy_lr * 100.0))
print("-----------------------------------------------")
print(classification_report(Y_test, pred_lr))
print("-----------------------------------------------")
print(confusion_matrix(Y_test, pred_lr))
Accuracy: 79.87%
-----------------------------------------------
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       107
           1       0.70      0.60      0.64        47

    accuracy                           0.80       154
   macro avg       0.77      0.74      0.75       154
weighted avg       0.79      0.80      0.79       154

-----------------------------------------------
[[95 12]
 [19 28]]
In [126]:
from sklearn.metrics import roc_auc_score

# auc scores
auc_score1 = roc_auc_score(Y_test, pred_lr)


print(auc_score1*100)
74.17975740703918
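Note that roc_auc_score was given the hard 0/1 predictions here; AUC is normally computed from predicted probabilities, which uses the model's full ranking (a sketch):

# Probability of the positive class, then rank-based AUC
proba_lr = lr.predict_proba(X_test)[:, 1]
print(roc_auc_score(Y_test, proba_lr) * 100)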

Hyperparameter Tuning¶

In [127]:
param_grid= [{'penalty': ['l1', 'l2', 'elasticnet', 'none'],
             'C': np.logspace(-4,4,20),
             'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
             'max_iter': [100,500,1000,1500,2000,2500,5000]}]
In [128]:
from sklearn.model_selection import GridSearchCV
In [129]:
clf= GridSearchCV(lr, param_grid= param_grid, cv=3, verbose=True, n_jobs=-1)
In [131]:
%%time
best_clf= clf.fit(X_train, Y_train)
Fitting 3 folds for each of 2800 candidates, totalling 8400 fits
CPU times: total: 8.03 s
Wall time: 1min 25s
C:\Users\kiran\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py:378: FitFailedWarning: 
3780 fits failed out of a total of 8400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures (420 fits failed with each of the following errors):
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.
ValueError: Solver sag supports only 'l2' or 'none' penalties, got l1 penalty.
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got elasticnet penalty.
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got elasticnet penalty.
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.
ValueError: Solver sag supports only 'l2' or 'none' penalties, got elasticnet penalty.
TypeError: unsupported operand type(s) for -: 'int' and 'NoneType'
ValueError: penalty='none' is not supported for the liblinear solver
  warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\kiran\anaconda3\Lib\site-packages\sklearn\model_selection\_search.py:952: UserWarning: One or more of the test scores are non-finite: [       nan        nan 0.64006855 ...        nan 0.73283118 0.74261916]
  warnings.warn(
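The failed fits all come from solver/penalty combinations that scikit-learn rejects (the TypeError is saga with penalty='elasticnet' but no l1_ratio). One way to avoid them (a sketch) is to split the grid so each solver only sees penalties it supports:

param_grid = [
    {'solver': ['lbfgs', 'newton-cg', 'sag'], 'penalty': ['l2'],
     'C': np.logspace(-4, 4, 20), 'max_iter': [1000]},
    {'solver': ['liblinear', 'saga'], 'penalty': ['l1', 'l2'],
     'C': np.logspace(-4, 4, 20), 'max_iter': [1000]},
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.5],
     'C': np.logspace(-4, 4, 20), 'max_iter': [1000]},
]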
In [132]:
best_clf.best_estimator_
Out[132]:
LogisticRegression(C=0.23357214690901212, solver='newton-cg')
In [133]:
best_clf.best_params_
Out[133]:
{'C': 0.23357214690901212,
 'max_iter': 100,
 'penalty': 'l2',
 'solver': 'newton-cg'}
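The next cell retrains with manually typed hyperparameters that differ from best_params_ above; the search result can also be reused directly (a sketch):

# Rebuild the estimator straight from the grid-search winner
lr_best = LogisticRegression(**best_clf.best_params_)
lr_best.fit(X_train, Y_train)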
In [134]:
lr2=LogisticRegression(C= 4.281332398719396,penalty='l1',max_iter=100,solver= 'liblinear' )
In [135]:
lr2.fit(X_train,Y_train)
Out[135]:
LogisticRegression(C=4.281332398719396, penalty='l1', solver='liblinear')
In [136]:
pred_lr2= lr2.predict(X_test)
In [137]:
accuracy_lr2=accuracy_score(Y_test, pred_lr2)
print("Accuracy: %.2f%%" % (accuracy_lr2 * 100.0))
print("-----------------------------------------------")
print(classification_report(Y_test, pred_lr2))
print("-----------------------------------------------")
print(confusion_matrix(Y_test, pred_lr2))
Accuracy: 79.87%
-----------------------------------------------
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       107
           1       0.70      0.60      0.64        47

    accuracy                           0.80       154
   macro avg       0.77      0.74      0.75       154
weighted avg       0.79      0.80      0.79       154

-----------------------------------------------
[[95 12]
 [19 28]]
In [138]:
auc_score = roc_auc_score(Y_test, pred_lr2)


print(auc_score)
0.7417975740703917

SVM¶

In [139]:
from sklearn import svm
In [140]:
sVm = svm.SVC()
In [141]:
%%time
sVm.fit(X_train,Y_train)
CPU times: total: 15.6 ms
Wall time: 22 ms
Out[141]:
SVC()
In [142]:
pred_svm= sVm.predict(X_test)
In [143]:
accuracy_svm=accuracy_score(Y_test, pred_svm)
print("Accuracy: %.2f%%" % (accuracy_svm * 100.0))
print("-----------------------------------------------")
print(classification_report(Y_test, pred_svm))
print("-----------------------------------------------")
print(confusion_matrix(Y_test, pred_svm))
Accuracy: 79.22%
-----------------------------------------------
              precision    recall  f1-score   support

           0       0.81      0.92      0.86       107
           1       0.73      0.51      0.60        47

    accuracy                           0.79       154
   macro avg       0.77      0.71      0.73       154
weighted avg       0.78      0.79      0.78       154

-----------------------------------------------
[[98  9]
 [23 24]]
In [144]:
param_grid1 = [{'C': [1.0, 2.0, 0, 5.0, 6.0, 7.0, 10.0],
               'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
               'gamma': ['scale', 'auto']
               }]
In [145]:
from sklearn.model_selection import RandomizedSearchCV
In [147]:
svm1 = svm.SVC(kernel= 'Linear')
In [148]:
clf_svm_random= RandomizedSearchCV(estimator=svm1,param_distributions=param_grid1,cv=3, verbose=100, n_jobs=-1)
In [150]:
%%time
best_clf_svm= clf_svm_random.fit(X_train, Y_train)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
---------------------------------------------------------------------------
_RemoteTraceback                          Traceback (most recent call last)
_RemoteTraceback: 
"""
Traceback (most recent call last):
  File "C:\Users\kiran\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 678, in _fit_and_score
    X_train, y_train = _safe_split(estimator, X, y, train)
  File "C:\Users\kiran\anaconda3\Lib\site-packages\sklearn\utils\metaestimators.py", line 227, in _safe_split
    raise ValueError("X should be a square kernel matrix")
ValueError: X should be a square kernel matrix
"""

The above exception was the direct cause of the following exception:

ValueError                                Traceback (most recent call last)
File <timed exec>:1

File ~\anaconda3\Lib\site-packages\sklearn\model_selection\_search.py:874, in BaseSearchCV.fit(self, X, y, groups, **fit_params)
--> 874 self._run_search(evaluate_candidates)

ValueError: X should be a square kernel matrix
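The failure comes from the search space itself: kernel='precomputed' requires X to be a square kernel matrix rather than a feature table, C must be strictly positive (the grid contains 0), and kernel names are lower-case (the base estimator's 'Linear' would be rejected if it were ever used). A corrected sketch under those fixes:

# Corrected search space: drop 'precomputed', drop C=0, use a default SVC
param_grid1 = [{'C': [1.0, 2.0, 5.0, 6.0, 7.0, 10.0],
                'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                'gamma': ['scale', 'auto']}]
svm1 = svm.SVC()
clf_svm_random = RandomizedSearchCV(estimator=svm1, param_distributions=param_grid1,
                                    cv=3, n_jobs=-1)
best_clf_svm = clf_svm_random.fit(X_train, Y_train)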
In [151]:
X_test.head()
Out[151]:
Age_mod Pregnancies_mod Glucose_mod BloodPressure_mod SkinThickness_mod Insulin_mod BMI_mod DiabetesPedigreeFunction_mod
661 1.0 1 199 76 43 23 42.9 1.394
122 1.0 2 107 74 30 100 33.6 0.404
113 1.0 4 76 62 2 23 34.0 0.391
14 0.0 5 166 72 19 175 25.8 0.587
529 0.0 0 111 65 2 23 24.6 0.660

Decision Trees¶

In [152]:
from sklearn import tree
In [153]:
dt = tree.DecisionTreeClassifier()
#clf = clf.fit(X, Y)
In [154]:
param_grid_dt= [{'criterion': ["gini", "entropy"],
             'splitter': ["best", "random"],
             'max_depth': [5,10,20,25,30,50],
             'max_features': [2,4,6,8,10,"auto", "sqrt", "log2"]}]
In [155]:
# Note: the variable name is reused from the SVM section; this search tunes the decision tree
clf_svm_random= RandomizedSearchCV(estimator=dt, param_distributions=param_grid_dt, cv=5, verbose=100, n_jobs=-1)
In [156]:
%%time
clf_svm_random.fit(X_train, Y_train)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
CPU times: total: 203 ms
Wall time: 5.82 s
Out[156]:
RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
                   param_distributions=[{'criterion': ['gini', 'entropy'],
                                         'max_depth': [5, 10, 20, 25, 30, 50],
                                         'max_features': [2, 4, 6, 8, 10,
                                                          'auto', 'sqrt',
                                                          'log2'],
                                         'splitter': ['best', 'random']}],
                   verbose=100)
In [157]:
clf_svm_random.best_estimator_
Out[157]:
DecisionTreeClassifier(max_depth=5, max_features=10, splitter='random')
In [158]:
clf_svm_random.best_params_
Out[158]:
{'splitter': 'random', 'max_features': 10, 'max_depth': 5, 'criterion': 'gini'}

Final Decision Tree model¶

In [159]:
%%time
dt1 = tree.DecisionTreeClassifier(criterion='entropy',splitter='best',max_depth=10,max_features='log2')
dtfit = dt1.fit(X_train, Y_train)
CPU times: total: 0 ns
Wall time: 4 ms
In [160]:
pred_dt= dtfit.predict(X_test)
In [161]:
accuracy_dt=accuracy_score(Y_test, pred_dt)
print("Accuracy: %.2f%%" % (accuracy_dt * 100.0))
print("-----------------------------------------------")
print(classification_report(Y_test, pred_dt))
print("-----------------------------------------------")
print(confusion_matrix(Y_test, pred_dt))
Accuracy: 72.73%
-----------------------------------------------
              precision    recall  f1-score   support

           0       0.80      0.81      0.81       107
           1       0.56      0.53      0.54        47

    accuracy                           0.73       154
   macro avg       0.68      0.67      0.67       154
weighted avg       0.72      0.73      0.73       154

-----------------------------------------------
[[87 20]
 [22 25]]
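For comparability with the logistic-regression AUC above, the same probability-based score can be computed for the tree (a sketch):

proba_dt = dtfit.predict_proba(X_test)[:, 1]
print(roc_auc_score(Y_test, proba_dt) * 100)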