# %% - - visual studio code cell - -
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import statistics as stats

# %% - - visual studio code cell - -
# - - add a header, then convert to csv - -
# The raw data file has no header row; prepend one so pandas can name the
# columns, and copy the data lines through unchanged.
header = (
    "survival,age,sex,steroid,antivirals,fatigue,malaise,anorexia,"
    "liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,"
    "alk_phosphate,sgot,albumin,protime,histology\n"
)
with open("hepatitis.data", "r") as src:
    rows = src.readlines()
with open("hep.csv", "w") as dst:
    dst.write(header)  # write header row
    dst.writelines(rows)

# %% - - visual studio code cell - -
# - - plot missing values - -
# Missing entries are encoded as the literal string "?" in the data file.
hep = pd.read_csv("hep.csv")
missing_counts = [int((hep[col] == "?").sum()) for col in hep.columns]
plt.rcParams.update({"font.size": 16})
# One bar per column; derive the count from the data rather than hard-coding it.
plt.bar(range(len(missing_counts)), missing_counts)
plt.xlabel("variable")
plt.ylabel("count")
plt.show()

# %% - - visual studio code cell - -
# - - fill in missing values - -
for col in hep.columns:
    missing = hep[col] == "?"
    if not missing.any():
        continue  # column has no missing values

    # Columns containing "?" are read by pandas as strings, so every
    # comparison below is against string values.
    # NOTE: list.sort() returns None, so compare sorted() results instead.
    if sorted(hep[col].unique().tolist()) == ["1", "2", "?"]:
        # categorical column: fill with the more frequent category
        # (ties resolve to "1")
        ones = int((hep[col] == "1").sum())
        twos = int((hep[col] == "2").sum())
        fill = "2" if twos > ones else "1"
        hep[col] = hep[col].mask(missing, fill).astype(int)
    else:
        # non-categorical column: fill with the median of the observed
        # (non-missing) values, then store the column as numeric
        observed = pd.to_numeric(hep.loc[~missing, col])
        fill = stats.median(observed.tolist())
        hep[col] = pd.to_numeric(hep[col].mask(missing)).fillna(fill)

hep.to_csv("hep_filled.csv", index=False)

# %% - - visual studio code cell - -