# %% - - visual studio code cell - -
import numpy as np
import pandas as pd
from sklearn import linear_model

# %% - - visual studio code cell - -
# data generated by our cleaning process
temp_data = pd.read_csv('cleaned_data.csv')
C = temp_data.drop(columns=["breakfast", "lunch", "dinner"])
C = C.corr()
C[C == 1.0] = 0.0  # disregard the 1s on the diagonal

# count how many entries meet our criteria; each correlated pair is
# counted twice because the correlation matrix is symmetric
(C > 0.85).sum().sum()   # result = 6
(C < -0.85).sum().sum()  # result = 0

# rows whose variable has exactly one correlation above 0.85
high_corr = C[(C > 0.85).sum() == 1]
row_names = list(high_corr.index)
# sub-matrix consisting of only the > 0.85 correlations
high_corr[row_names]

# %% - - visual studio code cell - -
data = pd.read_csv('cleaned_data.csv')  # csv produced by the data cleaning sections of the article
x = data.iloc[:, 1:]  # make a copy and drop the string labels in the 1st column
x = x.drop(columns=["breakfast", "lunch", "dinner", "fat", "portland", "non-alcoholic"])

label = "breakfast"  # select "breakfast", "lunch" or "dinner"
y = pd.DataFrame(data, columns=[label])
y = y.values.ravel()  # turn into a numpy array, flattened to 1D for sklearn

# %% - - visual studio code cell - -
# set regularization (penalty) to None in sklearn's logistic regression class
# (penalty=None requires a recent scikit-learn; older versions used penalty='none');
# raise max_iter so the solver converges, a known issue for well-separated datasets
logit_b = linear_model.LogisticRegression(max_iter=12000, penalty=None)
logit_b.fit(x, y)

c = logit_b.coef_[0]    # numpy array of coefficients
bottom = np.argsort(c)  # indices of coefficients from smallest to largest
top = np.flip(bottom)   # reversed, so largest to smallest

top_b = x.iloc[:, top[0:30]].columns     # top 30 columns by coefficient size
bot_b = x.iloc[:, bottom[0:30]].columns  # bottom 30 columns by coefficient size

p_b = logit_b.predict_proba(x)[:, 1]  # predicted probability of the positive class
score_b = logit_b.score(x, y)         # mean accuracy on the training data

# %% - - visual studio code cell - -
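# The cell below is an illustrative sketch, not part of the original
# analysis: it prints the results computed above and adds a 5-fold
# cross-validation, since score_b is measured on the training data and
# is therefore optimistic. It assumes x, y, label, top_b, bot_b and
# score_b are still in scope from the previous cells.
from sklearn.model_selection import cross_val_score

print(f"in-sample accuracy for '{label}': {score_b:.3f}")
print("10 largest coefficients:", list(top_b[:10]))
print("10 most negative coefficients:", list(bot_b[:10]))

cv_scores = cross_val_score(
    linear_model.LogisticRegression(max_iter=12000, penalty=None), x, y, cv=5
)
print(f"5-fold cross-validated accuracy: {cv_scores.mean():.3f}")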