# Fetch the insurance dataset into the working directory (-N enables wget's
# timestamp check before re-downloading).
!wget -N https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv

--2021-04-01 00:16:40--  https://cdn.freecodecamp.org/project-data/health-costs/insurance.csv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 172.67.70.149, 104.26.3.33
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/csv]
Saving to: ‘insurance.csv’

insurance.csv           [ <=>                ]  49.09K  --.-KB/s    in 0.05s

Last-modified header missing -- time-stamps turned off.
2021-04-01 00:16:40 (1.02 MB/s) - ‘insurance.csv’ saved [50264]


# Install the tensorflow_docs helper package straight from GitHub
# (-q keeps pip's output quiet).
!pip install -q git+https://github.com/tensorflow/docs

WARNING: You are using pip version 20.2.4; however, version 21.0.1 is available.
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling


# Load the downloaded CSV into a DataFrame and show its row count.
dataset = pd.read_csv(filepath_or_buffer='insurance.csv')
dataset.shape[0]

1338


# Preview the first rows of the raw data, before any encoding is applied.
dataset.head()


# Replace each categorical column with integer codes. pd.factorize numbers the
# categories in order of first appearance; only the codes are kept.
df = dataset  # alias, not a copy: the assignments below modify `dataset` too
for column in ("sex", "region", "smoker"):
    codes, _uniques = pd.factorize(df[column])
    df[column] = codes
dataset = df
dataset.head()


# Hold out a random 20% of the rows as the test set. No random_state is
# passed, so a different subset is drawn on every run.
test_dataset = dataset.sample(frac=0.2)
test_dataset.shape[0]

268


# Training set = every row whose index label was not sampled into the test set.
# DataFrame.drop removes those rows directly and keeps the original column
# dtypes; masking with isin() followed by dropna() upcast the integer columns
# to float and would also discard any row that already contained a NaN.
train_dataset = dataset.drop(index=test_dataset.index)
len(train_dataset)

1070


# Inspect the training split; it still includes the "expenses" column here.
train_dataset.head()


# pop() removes the "expenses" column from train_dataset in place and returns
# it, so train_dataset keeps only the feature columns afterwards.
train_labels = train_dataset.pop("expenses")
train_labels.head()

0    16884.92
1     1725.55
3    21984.47
4     3866.86
5     3756.62
Name: expenses, dtype: float64


# Training features after the label column has been popped off.
train_dataset.head()


# Same split for the test set: pop() strips "expenses" out of test_dataset in
# place and returns it as the label Series.
test_labels = test_dataset.pop("expenses")
test_labels.head()

261     17085.27
1311     4571.41
1271     3021.81
775     10560.49
1167     4529.48
Name: expenses, dtype: float64


# Test features after the label column has been popped off.
test_dataset.head()


# Normalization layer whose statistics are computed from the training features.
normalizer = layers.experimental.preprocessing.Normalization()
normalizer.adapt(train_dataset.to_numpy())

# Stack: normalize -> Dense(16) -> Dense(4) -> Dropout(0.2) -> Dense(1).
# No activation is passed, so every Dense layer is a plain linear map.
model = keras.Sequential()
model.add(normalizer)
model.add(layers.Dense(16))
model.add(layers.Dense(4))
model.add(layers.Dropout(.2))
model.add(layers.Dense(1))


# Optimise mean absolute error with Adam; MAE and MSE are also reported as
# metrics, which fixes the (loss, mae, mse) order returned by evaluate().
adam = tf.optimizers.Adam(learning_rate=0.1)
model.compile(loss='mae', optimizer=adam, metrics=['mae', 'mse'])

# Build the model, then print the layer-by-layer parameter table.
model.build()
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
normalization_3 (Normalizati (None, 6)                 13
_________________________________________________________________
dense_9 (Dense)              (None, 16)                112
_________________________________________________________________
dense_10 (Dense)             (None, 4)                 68
_________________________________________________________________
dropout_3 (Dropout)          (None, 4)                 0
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 5
=================================================================
Total params: 198
Trainable params: 185
Non-trainable params: 13
_________________________________________________________________


# Train for 100 epochs. validation_split=0.5 holds half of the training rows
# out for validation, so only the other half is used for weight updates.
history = model.fit(
    x=train_dataset,
    y=train_labels,
    validation_split=0.5,
    epochs=100,
    verbose=0,  # disable logging
)

# Prints the History object's repr; per-epoch values live in history.history.
print(history)

<tensorflow.python.keras.callbacks.History object at 0x7fe93c25dda0>


# RUN THIS CELL TO TEST YOUR MODEL. DO NOT MODIFY CONTENTS.
# Test model by checking how well the model generalizes using the test set.
# evaluate() returns the loss followed by the compiled metrics, in order.
loss, mae, mse = model.evaluate(test_dataset, test_labels, verbose=2)

print("Testing set Mean Abs Error: {:5.2f} expenses".format(mae))

# Pass mark for the challenge: test MAE below 3500.
if mae < 3500:
  print("You passed the challenge. Great job!")
else:
  print("The Mean Abs Error must be less than 3500. Keep trying.")

# Plot predictions.
# predict() output is flattened to a 1-D array for plotting.
test_predictions = model.predict(test_dataset).flatten()

# Equal aspect ratio so both axes use the same scale.
a = plt.axes(aspect='equal')
plt.scatter(test_labels, test_predictions)
plt.xlabel('True values (expenses)')
plt.ylabel('Predictions (expenses)')
lims = [0, 50000]
plt.xlim(lims)
plt.ylim(lims)
# Diagonal y = x reference line: perfect predictions would fall on it.
_ = plt.plot(lims,lims)

9/9 - 0s - loss: 2933.1277 - mae: 2933.1277 - mse: 29813978.0000
Testing set Mean Abs Error: 2933.13 expenses
You passed the challenge. Great job!

	age	sex	bmi	children	smoker	region	expenses
0	19	female	27.9	0	yes	southwest	16884.92
1	18	male	33.8	1	no	southeast	1725.55
2	28	male	33.0	3	no	southeast	4449.46
3	33	male	22.7	0	no	northwest	21984.47
4	32	male	28.9	0	no	northwest	3866.86

Linear Regression Health Costs Calculator

Note

Problem description

Solution

Download data

Install tensorflow docs

Import libraries

Prepare datasets

Prepare the labels

Prepare the model

Test

	age	sex	bmi	children	smoker	region	expenses
0	19.0	0.0	27.9	0.0	0.0	0.0	16884.92
1	18.0	1.0	33.8	1.0	1.0	1.0	1725.55
3	33.0	1.0	22.7	0.0	1.0	2.0	21984.47
4	32.0	1.0	28.9	0.0	1.0	2.0	3866.86
5	31.0	0.0	25.7	0.0	1.0	1.0	3756.62

	age	sex	bmi	children	smoker	region
261	20	0	26.8	1	0	1
1311	33	0	26.7	0	1	2
1271	25	0	34.5	0	1	2
775	51	1	33.3	3	1	1
1167	29	0	24.6	2	1	0