import numpy as np
import pandas as pd
from sklearn import linear_model

data = pd.read_csv('bld.csv')  # csv produced by data cleaning sections of article

x = data.iloc[:, 1:]  # make a copy and get rid of the string labels in the 1st column
x = x.drop(columns=['breakfast', 'lunch', 'dinner'])  # drop the target columns

# - - - - breakfast - - - -
label = "breakfast"
y = pd.DataFrame(data, columns=[label])
y = y.values.ravel()  # turn into a numpy array, flattened to 1D for sklearn

# logistic regression class, with regularization (penalty) turned off.
# I had to increase max_iter for the solver to converge; this is a
# known issue for well-separated datasets.
logit_b = linear_model.LogisticRegression(max_iter=12000, penalty=None)
logit_b.fit(x, y)

c = logit_b.coef_[0]  # numpy array of coefficients
bottom = np.argsort(c)  # indices of coefficients, smallest to largest
top = np.flip(bottom)  # reverse so it runs largest to smallest

top_b = x.iloc[:, top[0:30]].columns  # names of the top 30 columns by coefficient size
bot_b = x.iloc[:, bottom[0:30]].columns  # names of the bottom 30 columns by coefficient size

p_b = logit_b.predict_proba(x)[:, 1]  # predicted probability of the positive class
score_b = logit_b.score(x, y)  # mean accuracy on the training data
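To see what the model actually found, you can print the extracted column names and the training accuracy. A minimal sketch, assuming the script above has already run:

print("top 30 breakfast predictors:", list(top_b))
print("bottom 30 breakfast predictors:", list(bot_b))
print("training accuracy: %.3f" % score_b)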