import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
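# Load the UCI Adult (census income) dataset; the raw file ships without a
# header row, so column names are assigned manually below.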
adults = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", header=None)
adults.head()
adults.columns = ["Age", "Work_class", "fnlwgt", "Education", "Education_num", "Marital_status", "Occupation",
                  "Relationship", "Race", "Sex", "Capital_gain", "Capital_loss", "hpw", "native_country", "Class"]  # hpw = hours per week
adults["Class"].unique()
adults["Class"] = adults["Class"].map({' <=50K':0,' >50K':1})
for i in adults.columns:
    print("{} has {} unique values:\n {}".format(i, adults[i].nunique(), adults[i].unique()), "\n")
categorical_columns = adults.columns[adults.dtypes!=np.int64]
categorical_columns
# for i in categorical_columns:
#     adults[i] = adults[i].str.replace(r"\?", "NaN", regex=True)
adults.loc[:,categorical_columns].head(1)
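# Hedged check (assumption: this dataset marks missing values with '?'):
# count the placeholders per categorical column without mutating anything.
missing_mask = adults[categorical_columns].apply(lambda s: s.str.strip().eq("?"))
print(missing_mask.sum())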
marital = adults.groupby("Marital_status")["Class"].value_counts().unstack()
marital.head()
pd.crosstab(index=adults["Marital_status"], columns=adults["Class"],
            normalize="columns",
            margins=False,
            values=adults.Sex,
            aggfunc="count").sort_values(1).plot(kind="bar")
plt.title("Income vs Marital Status")
plt.xlabel("Status")
plt.ylabel("% of Class")
marital.columns = ["<=50K", ">50K"]
marital.plot(kind="barh", color=["firebrick", "gold"], figsize=(10, 4))
plt.legend()
plt.ylabel("Marital Status")
plt.xlabel("Number of Adults")
plt.title("Income vs Marital Status")
for i in categorical_columns:
    adults[i] = adults[i].str.replace(" ", "")  # strip the leading space from every categorical value
adults.groupby(["Sex","Maritial_status"])["Class"].value_counts().unstack().index
female_mar = adults.groupby(["Sex","Maritial_status"])["Class"].value_counts().unstack().loc["Female"]
male_mar = adults.groupby(["Sex","Maritial_status"])["Class"].value_counts().unstack().loc["Male"]
MS = adults.groupby(["Maritial_status"])["Class"].value_counts().unstack()
MSP = MS.div(MS.sum(axis=1), axis=0)  # row-normalise: share of each income class within each marital status
MSP.columns = ["<=$50K",">$50K"]
p = MSP.plot(kind = "barh", color = ["slategray","peachpuff"])
t = p.get_yticklabels()
plt.xticks(np.arange(0,1.1,0.1));
plt.ylabel("Maritial Status")
plt.xlabel("% of Maritial sector")
plt.title("High Income vs Maritial Status")
plt.legend(loc = "upper right")
plt.legend(framealpha = 0.4)
for i in t:
    print(i)
Labels = list(female_mar.index)
Labels
labels = {i + 1: n for i, n in enumerate(female_mar.index)}
labels
labels[4]="Absent Spouse"
fig, (ax_left, ax_right) = plt.subplots(ncols=2, figsize=(20, 6), sharex=False, sharey=False)
# fig.subplots_adjust(hspace=1)
# We don't need to pass the actual y values to the ticks just yet; we can
# plot against a range of integers and swap in the real labels later.
y_pos = list(labels.keys())
ax_left.barh(y_pos,                    # modern matplotlib takes the y positions as the first argument (the old `bottom=` keyword is gone)
             width=female_mar[0],
             height=0.2,
             align='center',
             facecolor='brown',
             label="<=50K")
ax_left.barh([i + 0.2 for i in y_pos],
             width=female_mar[1],
             height=0.2,
             align='center',
             facecolor='gold',
             label=">50K")
ax_left.set_yticks([])
# ax_left.set_ylabel(Labels)
ax_left.invert_xaxis()
ax_right.barh(y_pos,
              width=male_mar[0],
              height=0.2,
              align="center",
              facecolor="brown",
              label="<=50K")
ax_right.barh([i + 0.2 for i in y_pos],
              width=male_mar[1],
              height=0.2,
              align="center",
              facecolor="gold",
              label=">50K")
ax_right.legend()
ax_left.legend()
ax_right.set_xticks(range(0,8000,500))
ax_left.set_xticks(range(0,8000,500))
ax_right.set_yticks(range(1,8))
# x moves tick labels relative to left edge of axes in axes units
ax_right.set_yticklabels(labels.values(),ha='center', x=-0.075,y = -0.2,rotation = 0, size =12);
ax_right.set_xlabel("Male", size=15)
ax_left.set_xlabel("Female",size = 15)
fig.suptitle("Income vs Sex & Marital Status", size=15)
gen = adults.groupby("Class")["Sex"].value_counts().unstack()
gen = gen.div(gen.sum(axis=1), axis=0)
gen.index =["<=$50K",">$50K"]
gen.plot(kind= "bar", rot = 0, color =["salmon","dimgrey"],figsize = (12,6))
plt.title("Gender vs High Income", size = 15)
plt.xlabel("Income", size = 15)
plt.ylabel("% of Income section", size = 15)
plt.legend(framealpha = 0.4)
# Visualising the correlation between the numerical columns
correlations = adults.corr(numeric_only=True)  # numeric_only (pandas >= 1.5) avoids errors on the remaining string columns
names = list(correlations.columns)
# plot correlation matrix
fig = plt.figure(figsize = (10,10))
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
ticks = np.arange(len(names))  # one tick per numeric column
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names,rotation = 90)
ax.set_yticklabels(names)
plt.show()
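# Optional alternative sketch (assumes seaborn is installed): an annotated
# heatmap shows the same correlation matrix with the coefficient in each cell.
import seaborn as sns
plt.figure(figsize=(10, 10))
sns.heatmap(correlations, vmin=-1, vmax=1, annot=True, fmt=".2f", cmap="coolwarm")
plt.show()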
ed_sex = adults.groupby("Education").Sex.value_counts().unstack()
ed_sex = ed_sex.div(ed_sex.sum(axis=1), axis=0)
ed_sex.plot(kind="bar", color=["salmon", "grey"])
ed = adults.groupby("Education")["Class"].value_counts().unstack().fillna(0)
ed = ed.sort_values(1)
ed.div(ed.sum(axis=0), axis=1).plot(kind="bar", color=["salmon", "grey"])  # column-normalise: education mix within each income class
rel = adults.groupby("Relationship")["Class"].value_counts().unstack()
rel.div(rel.sum(axis=1), axis=0).plot(kind="bar", color=["salmon", "grey"])
rel.plot(kind = "bar", color = ["salmon","grey"])
list(range(adults.Race.nunique()+1))
rac = adults.groupby("Race")["Class"].value_counts().unstack().sort_values(1)
rac.div(rac.sum(axis=1), axis=0).plot(kind="barh", color=["salmon", "grey"])
plt.title("Income by race group")
plt.xlabel("% of race group")
plt.ylabel("Race")
adults.groupby("native_country")["Class"].value_counts().unstack().sort_values(1).head()
numerical_columns = adults.columns[adults.dtypes == np.int64]
numerical_columns = numerical_columns.drop("Class")
numerical_columns
for col in categorical_columns:
    print("{} has {} unique values:\n {}".format(col, adults[col].nunique(), set(adults[col].unique())))
dummy = pd.get_dummies(adults[categorical_columns],drop_first=True)
dummy.head()
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
adults[numerical_columns] = sc.fit_transform(adults[numerical_columns])
adults[numerical_columns].head()
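# Caveat: the scaler was fit on the full dataset before any train/test split,
# so test-fold statistics leak into the transform. See the pipeline sketch
# after the cross-validation loop below for a leakage-free variant.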
data = pd.concat([dummy,adults[numerical_columns],adults["Class"]], axis = 1)
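# data now holds the one-hot encoded categoricals, the scaled numerics, and
# the binary target as its final column.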
X = data.iloc[: , :-1].values
y = data.iloc[:,-1].values
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
c1 = KNeighborsClassifier(n_neighbors=1)
c2 = RandomForestClassifier(random_state=1)
c3 = GaussianNB()
c4 = SVC()
lr = LogisticRegression(max_iter=1000)  # was defined but never evaluated; include it in the comparison
for c, label in zip([c1, c2, c3, c4, lr],
                    ["KNN", "RandomForest", "Naive Bayes", "Support Vector Machine", "Logistic Regression"]):
    scores = cross_val_score(c, X, y, cv=3, scoring="accuracy")
    print("accuracy {:.3f} +/- {:.3f} [{}]".format(scores.mean(), scores.std(), label))
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X , y ,test_size = 0.15 , random_state = 42)
from xgboost import XGBClassifier
classifier = XGBClassifier(n_estimators=100 , max_depth=2)
classifier.fit(X_train , y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test , y_pred)
score
x_score = cross_val_score(classifier , X, y , scoring="accuracy", cv = 10)
print(x_score.mean(),"+/-",x_score.std() )
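# Hedged sketch: inspect the fitted model's feature_importances_ (a standard
# XGBClassifier attribute); the order matches the feature columns of `data`.
importances = pd.Series(classifier.feature_importances_, index=data.columns[:-1])
importances.nlargest(10).plot(kind="barh")
plt.title("Top 10 XGBoost feature importances")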
from tpot import TPOTClassifier
tpot_classifier = TPOTClassifier(generations=5,population_size=60,verbosity=2,max_time_mins=2)
tpot_classifier.fit(X,y)
tpot_classifier.score(X_test,y_test)
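# Caveat: tpot_classifier was fit on all of X and y above, so scoring on
# X_test (a subset of X) overstates generalisation; fitting on X_train only
# would give an honest hold-out score.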
tpot_classifier.export("Adults_ML.py")
# %load Adults_ML.py
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
# NOTE: Make sure that the class is labeled 'class' in the data file
# tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
# features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(X, y, random_state=42)
exported_pipeline = make_pipeline(
    RFE(estimator=ExtraTreesClassifier(criterion="entropy", max_features=0.35000000000000003, n_estimators=100), step=0.6000000000000001),
    GradientBoostingClassifier(learning_rate=0.1, max_depth=8, max_features=0.9000000000000001, min_samples_leaf=1, min_samples_split=18, n_estimators=100, subsample=0.8)
)
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
results
accuracy_score(testing_target, results)
cross_val_score(exported_pipeline, X, y, cv = 10 , scoring = "accuracy").mean()