# %% - - visual studio code cell - -
import numpy as np
import pandas as pd
from sklearn import linear_model

# %% - - visual studio code cell - -
# data generated by our cleaning process
temp_data = pd.read_csv('cleaned_data.csv')
C = temp_data.drop(columns=["breakfast", "lunch", "dinner"])
C = C.corr()
C[C == 1.0] = 0.0  # disregard the 1s on the diagonal

# count how many entries meet our criteria; each correlated pair is
# counted twice because the correlation matrix is symmetric
(C > 0.85).sum().sum()   # result = 6
(C < -0.85).sum().sum()  # result = 0

# rows whose variable has exactly one correlation above 0.85
high_corr = C[(C > 0.85).sum() == 1]
row_names = list(high_corr.index)
# sub-matrix consisting of only the > 0.85 correlations
high_corr[row_names]

# %% - - visual studio code cell - -
data = pd.read_csv('cleaned_data.csv')  # csv produced by the data cleaning sections of the article
x = data.iloc[:, 1:]  # make a copy and drop the string labels in the 1st column
x = x.drop(columns=["breakfast", "lunch", "dinner", "fat", "portland", "non-alcoholic"])

label = "breakfast"  # select "breakfast", "lunch" or "dinner"
y = pd.DataFrame(data, columns=[label])
y = y.values.ravel()  # turn into a numpy array, flattened to 1D for sklearn

# %% - - visual studio code cell - -
# set regularization (penalty) to None in sklearn's logistic regression class
# (penalty=None requires a recent scikit-learn; older versions used penalty='none');
# raise max_iter so the solver converges, a known issue for well-separated datasets
logit_b = linear_model.LogisticRegression(max_iter=12000, penalty=None)
logit_b.fit(x, y)

c = logit_b.coef_[0]    # numpy array of coefficients
bottom = np.argsort(c)  # indices of coefficients from smallest to largest
top = np.flip(bottom)   # reversed, so largest to smallest

top_b = x.iloc[:, top[0:30]].columns     # top 30 columns by coefficient size
bot_b = x.iloc[:, bottom[0:30]].columns  # bottom 30 columns by coefficient size

p_b = logit_b.predict_proba(x)[:, 1]  # predicted probability of the positive class
score_b = logit_b.score(x, y)         # mean accuracy on the training data

# %% - - visual studio code cell - -
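# The cell below is an illustrative sketch, not part of the original
# analysis: it prints the results computed above and adds a 5-fold
# cross-validation, since score_b is measured on the training data and
# is therefore optimistic. It assumes x, y, label, top_b, bot_b and
# score_b are still in scope from the previous cells.
from sklearn.model_selection import cross_val_score

print(f"in-sample accuracy for '{label}': {score_b:.3f}")
print("10 largest coefficients:", list(top_b[:10]))
print("10 most negative coefficients:", list(bot_b[:10]))

cv_scores = cross_val_score(
    linear_model.LogisticRegression(max_iter=12000, penalty=None), x, y, cv=5
)
print(f"5-fold cross-validated accuracy: {cv_scores.mean():.3f}")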