"""Clean the Kaggle epirecipes data and save the breakfast/lunch/dinner subset.

Pipeline, executed top to bottom when the file is run:

1. read ``data_original.csv``;
2. drop rows with a missing calories/protein/fat/sodium value;
3. drop rows whose calories or sodium is 5000 or more;
4. keep rows whose breakfast + lunch + dinner label sum is strictly
   between 0 and 2;
5. drop zero-variance columns (from the third column onward) and ``brunch``;
6. write the result to ``bld.csv``.
"""
import pandas as pd
import matplotlib.pyplot as plt

# Nutrition columns that are cleaned and (optionally) plotted.
NUTRITION_COLUMNS = ['calories', 'protein', 'fat', 'sodium']


def plot_nutrition_values(frame):
    """Draw each nutrition column of *frame* in its own panel of a 2x2 grid.

    Only builds the figure; call ``plt.show()`` afterwards to display it.
    Note that this changes the global matplotlib font size to 14.
    """
    plt.rcParams.update({'font.size': 14})
    for position, column in enumerate(NUTRITION_COLUMNS, start=1):
        plt.subplot(2, 2, position)
        plt.title(column)
        plt.ylabel('value')
        plt.xticks([])
        plt.plot(frame[column])


def plot_outlier_counts(frame):
    """Plot, per nutrition column, how many rows reach each threshold.

    Thresholds are 1000, 2000, ..., 9000. One panel per column in a 2x2
    grid, each with its y-axis starting at 0. Only builds the figure; call
    ``plt.show()`` afterwards to display it.
    """
    thresholds = [step * 1000 for step in range(1, 10)]
    for position, column in enumerate(NUTRITION_COLUMNS, start=1):
        counts = [int((frame[column] >= limit).sum()) for limit in thresholds]
        plt.subplot(2, 2, position)
        plt.title('recipes with {} >= threshold'.format(column))
        plt.ylabel('count')
        plt.xlabel('threshold')
        plt.xticks(range(len(thresholds)), thresholds)
        plt.plot(counts, '--bo')
        # Set the limit before the figure is shown, and on every panel.
        plt.gca().set_ylim([0, None])


# 'data_original.csv' is the raw data downloaded from
# https://www.kaggle.com/hugodarwood/epirecipes/
data = pd.read_csv('data_original.csv')

# - - - remove rows with missing nutrition values - - -
data = data.dropna(subset=NUTRITION_COLUMNS)

# - - plotting nutrition values - -
# plot_nutrition_values(data); plt.show()  # uncomment to show the plot

# - - plot rows with extreme outliers - -
# plot_outlier_counts(data); plt.show()  # uncomment to show the plot

# - - remove rows with extreme outliers - -
data = data[(data['calories'] < 5000) & (data['sodium'] < 5000)]

# The two expressions below have no effect when the file runs as a script
# (presumably left over from interactive exploration). The recorded results
# were 0, and no protein/fat filter follows.
(data['protein'] > 600).sum()  # result = 0
(data['fat'] > 600).sum()  # result = 0

# - - trim data based on breakfast, lunch, dinner labels - -
# Keep rows whose label sum is strictly between 0 and 2, i.e. exactly one
# label when the columns are 0/1 flags (NOTE(review): confirm encoding).
label_sum = data['breakfast'] + data['lunch'] + data['dinner']
data = data[(label_sum > 0) & (label_sum < 2)]

# Row-count checks; also no-ops when run as a script.
len(data)  # 2460
(
    (data['breakfast'] == 1).sum()
    + (data['lunch'] == 1).sum()
    + (data['dinner'] == 1).sum()
)  # 2460

# - - drop columns with zero variation - -
# The first two columns are excluded from the variance check
# (NOTE(review): presumably they are not tag/nutrition columns - confirm).
zero_var = [
    name
    for name, column in data.iloc[:, 2:].items()
    if column.var() == 0
]

# Also drop the brunch column. Everything is dropped in a single,
# de-duplicated call so that 'brunch' being zero-variance itself does not
# trigger a KeyError from dropping it twice.
columns_to_drop = list(dict.fromkeys(zero_var + ['brunch']))
data = data.drop(columns=columns_to_drop)

# Save a copy for the next step. index=False keeps the row index out of the
# file, so re-reading it does not produce an auto-generated "Unnamed: 0"
# column.
data.to_csv('bld.csv', index=False)