import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
df=pd.read_csv("./insurance.csv")
df.head()
age | sex | bmi | children | smoker | region | charges | |
---|---|---|---|---|---|---|---|
0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1338 entries, 0 to 1337 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 1338 non-null int64 1 sex 1338 non-null object 2 bmi 1338 non-null float64 3 children 1338 non-null int64 4 smoker 1338 non-null object 5 region 1338 non-null object 6 charges 1338 non-null float64 dtypes: float64(2), int64(2), object(3) memory usage: 73.3+ KB
df.describe()
age | bmi | children | charges | |
---|---|---|---|---|
count | 1338.000000 | 1338.000000 | 1338.000000 | 1338.000000 |
mean | 39.207025 | 30.663397 | 1.094918 | 13270.422265 |
std | 14.049960 | 6.098187 | 1.205493 | 12110.011237 |
min | 18.000000 | 15.960000 | 0.000000 | 1121.873900 |
25% | 27.000000 | 26.296250 | 0.000000 | 4740.287150 |
50% | 39.000000 | 30.400000 | 1.000000 | 9382.033000 |
75% | 51.000000 | 34.693750 | 2.000000 | 16639.912515 |
max | 64.000000 | 53.130000 | 5.000000 | 63770.428010 |
df.isna().sum()
age 0 sex 0 bmi 0 children 0 smoker 0 region 0 charges 0 dtype: int64
df.isnull().sum()
age 0 sex 0 bmi 0 children 0 smoker 0 region 0 charges 0 dtype: int64
duplicate = df[df.duplicated()]
print(duplicate)
#not dealing with duplicate as this is not a problem
age sex bmi children smoker region charges 581 19 male 30.59 0 no northwest 1639.5631
X=df.iloc[:,:-1]
y=df.iloc[:,-1:]
X
age | sex | bmi | children | smoker | region | |
---|---|---|---|---|---|---|
0 | 19 | female | 27.900 | 0 | yes | southwest |
1 | 18 | male | 33.770 | 1 | no | southeast |
2 | 28 | male | 33.000 | 3 | no | southeast |
3 | 33 | male | 22.705 | 0 | no | northwest |
4 | 32 | male | 28.880 | 0 | no | northwest |
... | ... | ... | ... | ... | ... | ... |
1333 | 50 | male | 30.970 | 3 | no | northwest |
1334 | 18 | female | 31.920 | 0 | no | northeast |
1335 | 18 | female | 36.850 | 0 | no | southeast |
1336 | 21 | female | 25.800 | 0 | no | southwest |
1337 | 61 | female | 29.070 | 0 | yes | northwest |
1338 rows × 6 columns
y
charges | |
---|---|
0 | 16884.92400 |
1 | 1725.55230 |
2 | 4449.46200 |
3 | 21984.47061 |
4 | 3866.85520 |
... | ... |
1333 | 10600.54830 |
1334 | 2205.98080 |
1335 | 1629.83350 |
1336 | 2007.94500 |
1337 | 29141.36030 |
1338 rows × 1 columns
#LABEL ENCODING
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
X.sex=le.fit_transform(X.sex)
X.smoker=le.fit_transform(X.smoker)
X
age | sex | bmi | children | smoker | region | |
---|---|---|---|---|---|---|
0 | 19 | 0 | 27.900 | 0 | 1 | southwest |
1 | 18 | 1 | 33.770 | 1 | 0 | southeast |
2 | 28 | 1 | 33.000 | 3 | 0 | southeast |
3 | 33 | 1 | 22.705 | 0 | 0 | northwest |
4 | 32 | 1 | 28.880 | 0 | 0 | northwest |
... | ... | ... | ... | ... | ... | ... |
1333 | 50 | 1 | 30.970 | 3 | 0 | northwest |
1334 | 18 | 0 | 31.920 | 0 | 0 | northeast |
1335 | 18 | 0 | 36.850 | 0 | 0 | southeast |
1336 | 21 | 0 | 25.800 | 0 | 0 | southwest |
1337 | 61 | 0 | 29.070 | 0 | 1 | northwest |
1338 rows × 6 columns
X=pd.get_dummies(X,columns=["region"])
X
age | sex | bmi | children | smoker | region_northeast | region_northwest | region_southeast | region_southwest | |
---|---|---|---|---|---|---|---|---|---|
0 | 19 | 0 | 27.900 | 0 | 1 | 0 | 0 | 0 | 1 |
1 | 18 | 1 | 33.770 | 1 | 0 | 0 | 0 | 1 | 0 |
2 | 28 | 1 | 33.000 | 3 | 0 | 0 | 0 | 1 | 0 |
3 | 33 | 1 | 22.705 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | 32 | 1 | 28.880 | 0 | 0 | 0 | 1 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1333 | 50 | 1 | 30.970 | 3 | 0 | 0 | 1 | 0 | 0 |
1334 | 18 | 0 | 31.920 | 0 | 0 | 1 | 0 | 0 | 0 |
1335 | 18 | 0 | 36.850 | 0 | 0 | 0 | 0 | 1 | 0 |
1336 | 21 | 0 | 25.800 | 0 | 0 | 0 | 0 | 0 | 1 |
1337 | 61 | 0 | 29.070 | 0 | 1 | 0 | 1 | 0 | 0 |
1338 rows × 9 columns
plt.figure(figsize=(7,7))
plt.boxplot(X.age)
plt.plot()
[]
plt.figure(figsize=(7,7))
plt.boxplot(X.bmi)
plt.plot()
[]
tenth_percentile = np.percentile(X.bmi, 10)
ninetieth_percentile = np.percentile(X.bmi, 90)
# print(tenth_percentile, ninetieth_percentile)
X.bmi = np.where(X.bmi<tenth_percentile, tenth_percentile,X.bmi)
X.bmi = np.where(X.bmi>ninetieth_percentile, ninetieth_percentile, X.bmi)
# print("Sample:", sample)
print("New array:",X.bmi)
New array: 0 27.90 1 33.77 2 33.00 3 22.99 4 28.88 ... 1333 30.97 1334 31.92 1335 36.85 1336 25.80 1337 29.07 Name: bmi, Length: 1338, dtype: float64
X.head()
age | sex | bmi | children | smoker | region_northeast | region_northwest | region_southeast | region_southwest | |
---|---|---|---|---|---|---|---|---|---|
0 | 19 | 0 | 27.90 | 0 | 1 | 0 | 0 | 0 | 1 |
1 | 18 | 1 | 33.77 | 1 | 0 | 0 | 0 | 1 | 0 |
2 | 28 | 1 | 33.00 | 3 | 0 | 0 | 0 | 1 | 0 |
3 | 33 | 1 | 22.99 | 0 | 0 | 0 | 1 | 0 | 0 |
4 | 32 | 1 | 28.88 | 0 | 0 | 0 | 1 | 0 | 0 |
y.head()
charges | |
---|---|
0 | 16884.92400 |
1 | 1725.55230 |
2 | 4449.46200 |
3 | 21984.47061 |
4 | 3866.85520 |
plt.figure(figsize=(7,7))
plt.boxplot(X.bmi)
plt.plot()
[]
plt.figure(figsize=(15, 7))
sns.heatmap(X.corr(), annot=True)
<AxesSubplot:>
sns.displot(df.charges,kde=True,color="c",height=8,aspect=12/8)
plt.show()
plt.figure(figsize=(20,10))
sns.lineplot(x=X['age'],y=y["charges"])
<AxesSubplot:xlabel='age', ylabel='charges'>
plt.figure(figsize=(20,10))
sns.scatterplot(x=X['age'],y=y["charges"])
<AxesSubplot:xlabel='age', ylabel='charges'>
plt.figure(figsize=(20,10))
sns.lineplot(x=X['bmi'],y=y["charges"])
<AxesSubplot:xlabel='bmi', ylabel='charges'>
plt.figure(figsize=(20,10))
sns.scatterplot(x=X['bmi'],y=y["charges"])
<AxesSubplot:xlabel='bmi', ylabel='charges'>
plt.figure(figsize=(7,7))
sns.barplot(x=df['sex'],y=y["charges"])
<AxesSubplot:xlabel='sex', ylabel='charges'>
plt.figure(figsize=(7,7))
sns.violinplot(x=df['sex'],y=y["charges"])
<AxesSubplot:xlabel='sex', ylabel='charges'>
plt.figure(figsize=(7,7))
sns.barplot(x=df['children'],y=y["charges"])
<AxesSubplot:xlabel='children', ylabel='charges'>
plt.figure(figsize=(7,7))
sns.barplot(x=df['smoker'],y=y["charges"])
<AxesSubplot:xlabel='smoker', ylabel='charges'>
plt.figure(figsize=(7,7))
sns.violinplot(x=df['smoker'],y=y["charges"])
<AxesSubplot:xlabel='smoker', ylabel='charges'>
plt.figure(figsize=(7,7))
sns.barplot(x=df['region'],y=y["charges"])
<AxesSubplot:xlabel='region', ylabel='charges'>
plt.figure(figsize=(7,7))
sns.violinplot(x=df['region'],y=y["charges"])
<AxesSubplot:xlabel='region', ylabel='charges'>
plt.figure(figsize=(7,7))
sns.scatterplot(x=df['region'],y=y["charges"])
<AxesSubplot:xlabel='region', ylabel='charges'>
ax = sns.lmplot(x = 'age', y = 'charges', data=df, hue='smoker', palette='Set1')
ax = sns.lmplot(x = 'bmi', y = 'charges', data=df, hue='smoker', palette='Set2')
ax = sns.lmplot(x = 'children', y = 'charges', data=df, hue='smoker', palette='Dark2')
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn import metrics
Lin_reg = LinearRegression()
Lin_reg.fit(X_train_scaled, y_train)
print(Lin_reg.intercept_)
print(Lin_reg.coef_)
print(Lin_reg.score(X_test_scaled, y_test))
[13193.34164774] [[ 3.58071498e+03 -5.82485842e+01 2.18573544e+03 5.66906175e+02 9.53131007e+03 2.42102428e+17 2.43775374e+17 2.55759880e+17 2.49768909e+17]] 0.7984920471459229
y_pred=Lin_reg.predict(X_test_scaled)
print("R^2 Score -> "+str(metrics.r2_score(y_test,y_pred))
+"\nMean Squared Error -> "+str(metrics.mean_squared_error(y_test,y_pred))
+"\nMean Absolute Error -> "+str(metrics.mean_absolute_error(y_test,y_pred)))
R^2 Score -> 0.7984920471459229 Mean Squared Error -> 32065952.945277948 Mean Absolute Error -> 4031.965074761258
#visualising results
plt.figure(figsize=(10,5))
plt.scatter(y_test,y_pred)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual VS Predicted(MLR)')
Text(0.5, 1.0, 'Actual VS Predicted(MLR)')
def degree_poly(degree):
print(degree)
poly_reg = PolynomialFeatures(degree = degree)
x_poly = poly_reg.fit_transform(X_train_scaled)
poly_reg.fit(x_poly, y_train)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(x_poly, y_train)
y_pred_poly=lin_reg_2.predict(poly_reg.fit_transform(X_test_scaled))
print('R^2 for degree',degree,"",metrics.r2_score(y_test,y_pred_poly))
#visualising results
plt.figure(figsize=(5,5))
plt.scatter(y_test,y_pred_poly)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual VS Predicted(PLR)')
for i in range(1,10):
degree_poly(i)
1 R^2 for degree 1 0.7990388259250931 2 R^2 for degree 2 0.8831287400914294 3 R^2 for degree 3 0.8844591971398852 4 R^2 for degree 4 0.8572888475307322 5 R^2 for degree 5 0.3121857609233365 6 R^2 for degree 6 -2.9977829712496532e+16 7 R^2 for degree 7 -6411059025481245.0 8 R^2 for degree 8 -1.2926432048039931e+17 9 R^2 for degree 9 -127012862151386.02
poly_reg = PolynomialFeatures(degree = 3)
x_poly = poly_reg.fit_transform(X_train_scaled)
poly_reg.fit(x_poly, y_train)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(x_poly, y_train)
LinearRegression()
y_pred_poly=lin_reg_2.predict(poly_reg.fit_transform(X_test_scaled))
print("R^2 Score -> "+str(metrics.r2_score(y_test,y_pred_poly))
+"\nMean Squared Error -> "+str(metrics.mean_squared_error(y_test,y_pred_poly))
+"\nMean Absolute Error -> "+str(metrics.mean_absolute_error(y_test,y_pred_poly)))
R^2 Score -> 0.8844591971398852 Mean Squared Error -> 18386003.605798215 Mean Absolute Error -> 2886.9940784664177
#visualising results
plt.figure(figsize=(10,5))
plt.scatter(y_test,y_pred_poly)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual VS Predicted(PLR)')
Text(0.5, 1.0, 'Actual VS Predicted(PLR)')
pred=pd.DataFrame(y_pred,columns=['pred MLR'])
pred["pred PLR"]=y_pred_poly
pred["Actual"]=y_test.values
pred["Actual - MLR"]=y_test.values-y_pred
pred["Actual - PLR"]=y_test.values-y_pred_poly
pred
pred MLR | pred PLR | Actual | Actual - MLR | Actual - PLR | |
---|---|---|---|---|---|
0 | 11209.341648 | 12164.0 | 9724.53000 | -1484.811648 | -2439.47000 |
1 | 9417.341648 | 9948.0 | 8547.69130 | -869.650348 | -1400.30870 |
2 | 38505.341648 | 52580.0 | 45702.02235 | 7196.680702 | -6877.97765 |
3 | 17161.341648 | 15364.0 | 12950.07120 | -4211.270448 | -2413.92880 |
4 | 8105.341648 | 11316.0 | 9644.25250 | 1538.910852 | -1671.74750 |
... | ... | ... | ... | ... | ... |
263 | 15081.341648 | 17424.0 | 15019.76005 | -61.581598 | -2404.23995 |
264 | 8265.341648 | 8516.0 | 6664.68595 | -1600.655698 | -1851.31405 |
265 | 16041.341648 | 11812.0 | 20709.02034 | 4667.678692 | 8897.02034 |
266 | 33641.341648 | 41476.0 | 40932.42950 | 7291.087852 | -543.57050 |
267 | 8713.341648 | 8708.0 | 9500.57305 | 787.231402 | 792.57305 |
268 rows × 5 columns
print('MLR Scores')
print("R^2 Score -> "+str(metrics.r2_score(y_test,y_pred))
+"\nMean Squared Error -> "+str(metrics.mean_squared_error(y_test,y_pred))
+"\nMean Absolute Error -> "+str(metrics.mean_absolute_error(y_test,y_pred)))
MLR Scores R^2 Score -> 0.7984920471459229 Mean Squared Error -> 32065952.945277948 Mean Absolute Error -> 4031.965074761258
print('PLR with degree 3 Scores')
print("R^2 Score -> "+str(metrics.r2_score(y_test,y_pred_poly))
+"\nMean Squared Error -> "+str(metrics.mean_squared_error(y_test,y_pred_poly))
+"\nMean Absolute Error -> "+str(metrics.mean_absolute_error(y_test,y_pred_poly)))
PLR with degree 3 Scores R^2 Score -> 0.8844591971398852 Mean Squared Error -> 18386003.605798215 Mean Absolute Error -> 2886.9940784664177