# %% - - visual studio code cell - -


import pandas as pd
import matplotlib.pyplot as plt


# %% - - visual studio code cell - -


# 'original_data.csv' is the raw data downloaded from
# https://www.kaggle.com/hugodarwood/epirecipes/
# It is expected to sit in the current working directory.

raw_csv_path = 'original_data.csv'
data = pd.read_csv(raw_csv_path)


# %% - - visual studio code cell - -


# report every column that contains at least one missing value
# (vectorised: one boolean per column instead of a Python-level sum per cell)

columns_with_missing = data.columns[data.isnull().any()]
for col in columns_with_missing:
    print(col)

# remove rows with missing nutrition values
# (a row is dropped if any one of the four nutrition columns is missing)

data = data.dropna(subset=['calories', 'protein', 'fat', 'sodium'])


# %% - - visual studio code cell - -


# plotting nutrition values
# one panel per nutrition column in a 2x2 grid; each panel plots the column
# against the DataFrame index with the x tick marks hidden

plt.rcParams.update({'font.size': 14})

for panel, nutrient in enumerate(['calories', 'protein', 'fat', 'sodium'], start=1):
    plt.subplot(2, 2, panel)
    plt.title(nutrient)
    plt.ylabel('value')
    plt.xticks([])
    plt.plot(data[nutrient])

plt.show()


# %% - - visual studio code cell - -


# plot rows with extreme outliers
#
# For each nutrition column, count the recipes whose value is at or above
# each threshold (1000, 2000, ..., 9000) and draw the counts in a 2x2 grid,
# one panel per column.

xticks = [i * 1000 for i in range(1, 10)]

for panel, nutrient in enumerate(['calories', 'protein', 'fat', 'sodium'], start=1):
    # vectorised count of rows at or above each threshold
    count = [(data[nutrient] >= threshold).sum() for threshold in xticks]

    plt.subplot(2, 2, panel)
    plt.title('recipes with ' + nutrient + ' >= threshold')
    plt.ylabel('count')
    plt.xlabel('threshold')
    # counts are plotted at positions 0..8; label those positions with the
    # threshold values
    plt.xticks(range(len(xticks)), xticks)
    plt.plot(count, '--bo')

    # start the y-axis at zero on every panel so all four are read on the
    # same baseline (previously only calories and sodium were pinned)
    ax = plt.gca()
    ax.set_ylim([0, None])

plt.show()


# %% - - visual studio code cell - -


# remove rows with extreme outliers
# keep only rows where both calories and sodium are below 5000

within_limits = (data['calories'] < 5000) & (data['sodium'] < 5000)
data = data[within_limits]

# count rows with protein / fat above 600 in the remaining data
(data['protein'] > 600).sum() # result = 0
(data['fat'] > 600).sum() # result = 0


# %% - - visual studio code cell - -


# trim data based on breakfast, lunch, dinner labels
# keep only rows whose three meal labels sum to strictly between 0 and 2

label_sum = data['breakfast'] + data['lunch'] + data['dinner']
has_single_label = label_sum.gt(0) & label_sum.lt(2)
data = data[has_single_label]

len(data) # 2460
(data[['breakfast', 'lunch', 'dinner']] == 1).sum().sum() #2460


# %% - - visual studio code cell - -


# drop columns with zero variation
#
# The first two columns are skipped; every remaining column whose variance
# (as computed by Series.var()) equals 0 is collected by name and removed in
# a single drop call instead of one DataFrame copy per column.

zero_var = [
    name
    for name, values in data.iloc[:, 2:].items()
    if values.var() == 0
]
data = data.drop(columns=zero_var)

# also drop the brunch column
data = data.drop(columns="brunch")

# index=False keeps pandas from writing the row index, which would otherwise
# come back as an auto-generated "Unnamed: 0" column when the file is re-read
data.to_csv('cleaned_data.csv', index=False) # save a copy for next step


# %% - - visual studio code cell - -