# %% - - visual studio code cell - -
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split

# %% - - visual studio code cell - -
# generate 2d gaussian data points and plot
# https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Bivariate_case
mean1 = [0, 0]
cov1 = [[1, 0], [0, 1]]
mean2 = [1, 1]
cov2 = [[0.2, 0], [0, 1]]

# .T = np.transpose(), transposes a matrix
# transpose shape (1000, 2) to (2, 1000) for unpacking into 2 variables
gen = np.random.default_rng(seed = 0)
x, y = gen.multivariate_normal(mean1, cov1, 1000).T
a, b = gen.multivariate_normal(mean2, cov2, 1000).T

plt.plot(x, y, 'o')
plt.plot(a, b, 'o')

# %% - - visual studio code cell - -
# create labels and concatenate features
l = [1]*len(x) + [0]*len(a)
f = [np.concatenate((x, a)), np.concatenate((y, b))]  # numpy arrays do not concatenate when added

plt.plot(f[0][0:1000], f[1][0:1000], 'o')  # checking
plt.plot(f[0][1000:2000], f[1][1000:2000], 'o')

f = np.transpose(f)  # shape (n_samples, n_features) required

# %% - - visual studio code cell - -
# demonstrate overfitting with max_depth = 10000
X_train, X_test, y_train, y_test = train_test_split(f, l, test_size = 0.10, random_state = 0)

clf = RandomForestClassifier(n_estimators = 100, max_depth = 10000, random_state = 0)
clf.fit(X_train, y_train)

disp = DecisionBoundaryDisplay.from_estimator(clf, X_test, response_method="predict")
disp.ax_.scatter(X_test[:, 0], X_test[:, 1], c = y_test, edgecolor = "k")

plt.text(-3.2, 2.8, "trees = 100\nmax_depth = 10,000", fontsize = 12, bbox = dict(facecolor = 'white'))

proxy = [plt.Rectangle((0, 0), 1, 1, fc = pc.get_facecolor()[0])
         for pc in disp.surface_.collections]
plt.legend(handles=[proxy[0], proxy[6]], labels=[0, 1], loc='lower right', title='class')

plt.title('decision regions and test dataset points')
plt.savefig("fig3.png", dpi = 300, facecolor = 'white', edgecolor = 'white', bbox_inches = 'tight')

print("training set score : ", clf.score(X_train, y_train))
print("test set score : ", clf.score(X_test, y_test))

# %% - - visual studio code cell - -
# demonstrate that reducing max_depth to 2 fixes the overfitting
X_train, X_test, y_train, y_test = train_test_split(f, l, test_size = 0.10, random_state = 0)

clf = RandomForestClassifier(n_estimators = 100, max_depth = 2, random_state = 0)
clf.fit(X_train, y_train)

disp = DecisionBoundaryDisplay.from_estimator(clf, X_test, response_method="predict")
disp.ax_.scatter(X_test[:, 0], X_test[:, 1], c = y_test, edgecolor = "k")

plt.text(-3.2, 2.8, "trees = 100\nmax_depth = 2", fontsize = 12, bbox = dict(facecolor = 'white'))

proxy = [plt.Rectangle((0, 0), 1, 1, fc = pc.get_facecolor()[0])
         for pc in disp.surface_.collections]
plt.legend(handles=[proxy[0], proxy[6]], labels=[0, 1], loc='lower right', title='class')

plt.title('decision regions and test dataset points')
plt.savefig("fig5.png", dpi = 300, facecolor = 'white', edgecolor = 'white', bbox_inches = 'tight')

print("training set score : ", clf.score(X_train, y_train))
print("test set score : ", clf.score(X_test, y_test))

# %% - - visual studio code cell - -
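# (added sketch, not part of the original analysis) sweep max_depth and
# compare training vs. test accuracy to quantify the overfitting shown in the
# two figures above; the depth values below are illustrative assumptions
for depth in [1, 2, 4, 8, 16, None]:
    clf_sweep = RandomForestClassifier(n_estimators = 100, max_depth = depth, random_state = 0)
    clf_sweep.fit(X_train, y_train)
    print(f"max_depth = {depth} : "
          f"train = {clf_sweep.score(X_train, y_train):.3f}, "
          f"test = {clf_sweep.score(X_test, y_test):.3f}")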
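# %% - - visual studio code cell - -
# (added sketch, not part of the original analysis) matplotlib 3.8 deprecated
# QuadContourSet.collections, which the proxy-rectangle legends above rely on;
# on newer matplotlib an alternative is to build the legend handles directly
# from the colormap -- 'viridis' is assumed here because it is matplotlib's
# default and this script does not pass a cmap to DecisionBoundaryDisplay
from matplotlib import colormaps
from matplotlib.patches import Patch

disp = DecisionBoundaryDisplay.from_estimator(clf, X_test, response_method="predict")
disp.ax_.scatter(X_test[:, 0], X_test[:, 1], c = y_test, edgecolor = "k")

cmap = colormaps['viridis']
handles = [Patch(facecolor = cmap(0.0)), Patch(facecolor = cmap(1.0))]
plt.legend(handles = handles, labels = [0, 1], loc = 'lower right', title = 'class')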