import numpy as np
import pandas as pd
from sklearn import linear_model
# Load the cleaned dataset (csv produced by the data-cleaning sections
# of the article).
data = pd.read_csv('bld.csv')
# Feature matrix: drop the string-label first column, then remove the
# three meal-indicator columns so the targets are not used as predictors.
x = data.iloc[:, 1:].drop(columns=['breakfast', 'lunch', 'dinner'])
# - - - - breakfast - - - -
label = "breakfast"
# Target: the breakfast indicator column, flattened to a 1-D numpy array
# as sklearn's fit() expects.
y = pd.DataFrame(data, columns=[label])
y = y.values.ravel()
# Unpenalized logistic regression.  penalty=None replaces the string
# 'none', which was deprecated in scikit-learn 1.2 and removed in 1.4
# (the string form raises an error on current releases).
# max_iter is raised so the solver converges -- a known issue when the
# classes are well separated, since unregularized coefficients can grow
# without bound.
logit_b = linear_model.LogisticRegression(max_iter=12000, penalty=None)
logit_b.fit(x, y)
c = logit_b.coef_[0]  # 1-D numpy array of per-feature coefficients
bottom = np.argsort(c)  # feature indices ordered smallest coefficient first
top = np.flip(bottom)  # same indices reversed: largest coefficient first
top_b = x.iloc[:, top[0:30]].columns  # names of the 30 largest-coefficient features
bot_b = x.iloc[:, bottom[0:30]].columns  # names of the 30 smallest-coefficient features
p_b = logit_b.predict_proba(x)[:, 1]  # predicted probability of the positive class
score_b = logit_b.score(x, y)  # mean accuracy on the training data