# %% - - visual studio code cell - -

import pandas as pd
from matplotlib import pyplot as plt

import numpy as np
import statistics as stats


# %% - - visual studio code cell - -


# - - add a header, then convert to csv - -

# The data lines are copied verbatim, so hepatitis.data is assumed to already
# be comma-separated with one value per header column; the only thing added
# is the header row.

# Context managers close each file even if reading or writing raises.
with open("hepatitis.data", "r") as f:
    x = f.readlines()

header = "survival,age,sex,steroid,antivirals,fatigue,malaise,anorexia,"
header += "liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,"
header += "alk_phosphate,sgot,albumin,protime,histology\n"

with open("hep.csv", "w") as g:
    g.write(header)  # write header row
    g.writelines(x)  # data rows, unchanged


# %% - - visual studio code cell - -


# - - plot missing values - -

# Missing entries are encoded in the file as the literal string "?".
hep = pd.read_csv("hep.csv")

# One count per column, in column order. Columns parsed as numeric contain no
# "?" and therefore count as 0.
m = [(hep[col] == "?").sum() for col in hep.columns]

plt.rcParams.update({"font.size": 16})

# Bar positions are derived from the number of counts rather than a
# hard-coded 20, so the plot still works if the column set changes.
plt.bar(range(len(m)), m)
plt.xlabel("variable")
plt.ylabel("count")
plt.show()


# %% - - visual studio code cell - -

# - - fill in missing values - -

# Missing entries are the literal string "?". A column that contains one is
# compared here as text, so its observed values are matched as strings
# ("1", "2") and converted to numbers only where a median is needed.
#
#   * two-level categorical columns (codes 1/2) -> filled with the more
#     frequent code (1 on a tie)
#   * every other column with gaps              -> filled with the median of
#     the observed values

for col in hep.columns:

    missing = hep[col] == "?"

    if not missing.any():  # no missing values in this column
        continue

    observed = hep.loc[~missing, col]  # the values that are actually present

    # categorical columns with missing values
    # (set comparison: list.sort() returns None, so comparing its results
    # would always be True and never reach the median branch)

    if set(hep[col].unique()) == {"1", "2", "?"}:

        r = 1

        if (observed == "2").sum() > (observed == "1").sum():

            r = 2

        hep.loc[missing, col] = r

    # non-categorical columns with missing values

    else:

        # "?" is excluded and the remaining text is parsed to numbers before
        # taking the median; a non-numeric value raises ValueError here.
        hep.loc[missing, col] = stats.median(pd.to_numeric(observed).tolist())

hep.to_csv("hep_filled.csv", index=False)


# %% - - visual studio code cell - -