# %% - - visual studio code cell - -
import pandas as pd
import matplotlib.pyplot as plt

# %% - - visual studio code cell - -
# 'original_data.csv' is the raw data downloaded from
# https://www.kaggle.com/hugodarwood/epirecipes/
data = pd.read_csv('original_data.csv')

# Columns shared by the cleaning and plotting cells below.
NUTRITION_COLS = ['calories', 'protein', 'fat', 'sodium']
MEAL_COLS = ['breakfast', 'lunch', 'dinner']

# %% - - visual studio code cell - -
# report every column that contains at least one missing value
for col in data.columns:
    if data[col].isnull().any():
        print(col)

# remove rows with missing nutrition values
data = data.dropna(subset=NUTRITION_COLS)

# %% - - visual studio code cell - -
# plotting nutrition values (one panel per nutrition column, in row order)
plt.rcParams.update({'font.size': 14})

for panel, col in enumerate(NUTRITION_COLS, start=1):
    plt.subplot(2, 2, panel)
    plt.title(col)
    plt.ylabel('value')
    plt.xticks([])
    plt.plot(data[col])

plt.show()

# %% - - visual studio code cell - -
# plot rows with extreme outliers: for each nutrition column, count the
# recipes whose value is at or above each threshold (1000, 2000, ..., 9000)
thresholds = [i * 1000 for i in range(1, 10)]

for panel, col in enumerate(NUTRITION_COLS, start=1):
    count = [(data[col] >= threshold).sum() for threshold in thresholds]

    plt.subplot(2, 2, panel)
    plt.title('recipes with {} >= threshold'.format(col))
    plt.ylabel('count')
    plt.xlabel('threshold')
    # counts are plotted against positions 0..8; label them with thresholds
    plt.xticks(range(len(thresholds)), thresholds)
    plt.plot(count, '--bo')
    # anchor every count axis at zero so the four panels read the same way
    plt.gca().set_ylim([0, None])

plt.show()

# %% - - visual studio code cell - -
# remove rows with extreme outliers
data = data[(data['calories'] < 5000) & (data['sodium'] < 5000)]

# sanity check: after the filter above no extreme protein/fat rows should
# remain (the original author noted a result of 0 for both)
print('protein > 600:', (data['protein'] > 600).sum())
print('fat > 600:', (data['fat'] > 600).sum())

# %% - - visual studio code cell - -
# trim data based on breakfast, lunch, dinner labels:
# keep only recipes whose three meal labels sum to more than 0 and less than 2
label_sum = data['breakfast'] + data['lunch'] + data['dinner']
data = data[(label_sum > 0) & (label_sum < 2)]

# sanity check: the row count and the total of per-meal counts should agree
# (the original author noted 2460 for both)
print('rows:', len(data))
print('labelled rows:', sum((data[col] == 1).sum() for col in MEAL_COLS))

# %% - - visual studio code cell - -
# drop columns with zero variation; the first two columns are skipped,
# exactly as in the original positional loop starting at index 2
zero_var = [col for col in data.columns[2:] if data[col].var() == 0]
data = data.drop(columns=zero_var)

# also drop the brunch column
data = data.drop(columns="brunch")

# index=False keeps the DataFrame index out of the file, so reading it back
# does not produce an auto-generated "Unnamed: 0" column
data.to_csv('cleaned_data.csv', index=False)  # save a copy for next step

# %% - - visual studio code cell - -