# %% - - visual studio code cell - -
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split

# %% - - visual studio code cell - -
# generate 2d gaussian data points and plot
# https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Bivariate_case
mean1 = [0, 0]
cov1 = [[1, 0], [0, 1]]
mean2 = [1, 1]
cov2 = [[0.2, 0], [0, 1]]

# .T = np.transpose(), transposes a matrix
# transpose shape (1000, 2) to (2, 1000) for unpacking into 2 variables
gen = np.random.default_rng(seed = 0)
x, y = gen.multivariate_normal(mean1, cov1, 1000).T
a, b = gen.multivariate_normal(mean2, cov2, 1000).T

plt.plot(x, y, 'o')
plt.plot(a, b, 'o')

# %% - - visual studio code cell - -
# create labels and concatenate features
l = [1]*len(x) + [0]*len(a)
f = [np.concatenate((x, a)), np.concatenate((y, b))]  # numpy arrays do not concatenate when added

plt.plot(f[0][0:1000], f[1][0:1000], 'o')  # checking
plt.plot(f[0][1000:2000], f[1][1000:2000], 'o')

f = np.transpose(f)  # shape (n_samples, n_features) required

# %% - - visual studio code cell - -
# demonstrate overfitting with max_depth = 10000
X_train, X_test, y_train, y_test = train_test_split(f, l, test_size = 0.10, random_state = 0)

clf = RandomForestClassifier(n_estimators = 100, max_depth = 10000, random_state = 0)
clf.fit(X_train, y_train)

disp = DecisionBoundaryDisplay.from_estimator(clf, X_test, response_method="predict")
disp.ax_.scatter(X_test[:, 0], X_test[:, 1], c = y_test, edgecolor = "k")

plt.text(-3.2, 2.8, "trees = 100\nmax_depth = 10,000", fontsize = 12, bbox = dict(facecolor = 'white'))

proxy = [plt.Rectangle((0, 0), 1, 1, fc = pc.get_facecolor()[0])
         for pc in disp.surface_.collections]
plt.legend(handles=[proxy[0], proxy[6]], labels=[0, 1], loc='lower right', title='class')

plt.title('decision regions and test dataset points')
plt.savefig("fig3.png", dpi = 300, facecolor = 'white', edgecolor = 'white', bbox_inches = 'tight')

print("training set score : ", clf.score(X_train, y_train))
print("test set score : ", clf.score(X_test, y_test))

# %% - - visual studio code cell - -
# demonstrate that reducing max_depth to 2 fixes the overfitting
X_train, X_test, y_train, y_test = train_test_split(f, l, test_size = 0.10, random_state = 0)

clf = RandomForestClassifier(n_estimators = 100, max_depth = 2, random_state = 0)
clf.fit(X_train, y_train)

disp = DecisionBoundaryDisplay.from_estimator(clf, X_test, response_method="predict")
disp.ax_.scatter(X_test[:, 0], X_test[:, 1], c = y_test, edgecolor = "k")

plt.text(-3.2, 2.8, "trees = 100\nmax_depth = 2", fontsize = 12, bbox = dict(facecolor = 'white'))

proxy = [plt.Rectangle((0, 0), 1, 1, fc = pc.get_facecolor()[0])
         for pc in disp.surface_.collections]
plt.legend(handles=[proxy[0], proxy[6]], labels=[0, 1], loc='lower right', title='class')

plt.title('decision regions and test dataset points')
plt.savefig("fig5.png", dpi = 300, facecolor = 'white', edgecolor = 'white', bbox_inches = 'tight')

print("training set score : ", clf.score(X_train, y_train))
print("test set score : ", clf.score(X_test, y_test))

# %% - - visual studio code cell - -
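# (added sketch, not part of the original analysis) sweep max_depth and
# compare training vs. test accuracy to quantify the overfitting shown in the
# two figures above; the depth values below are illustrative assumptions
for depth in [1, 2, 4, 8, 16, None]:
    clf_sweep = RandomForestClassifier(n_estimators = 100, max_depth = depth, random_state = 0)
    clf_sweep.fit(X_train, y_train)
    print(f"max_depth = {depth} : "
          f"train = {clf_sweep.score(X_train, y_train):.3f}, "
          f"test = {clf_sweep.score(X_test, y_test):.3f}")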
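# %% - - visual studio code cell - -
# (added sketch, not part of the original analysis) matplotlib 3.8 deprecated
# QuadContourSet.collections, which the proxy-rectangle legends above rely on;
# on newer matplotlib an alternative is to build the legend handles directly
# from the colormap -- 'viridis' is assumed here because it is matplotlib's
# default and this script does not pass a cmap to DecisionBoundaryDisplay
from matplotlib import colormaps
from matplotlib.patches import Patch

disp = DecisionBoundaryDisplay.from_estimator(clf, X_test, response_method="predict")
disp.ax_.scatter(X_test[:, 0], X_test[:, 1], c = y_test, edgecolor = "k")

cmap = colormaps['viridis']
handles = [Patch(facecolor = cmap(0.0)), Patch(facecolor = cmap(1.0))]
plt.legend(handles = handles, labels = [0, 1], loc = 'lower right', title = 'class')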