In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:

adults = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",header = None,)

In [3]:

adults.head()

Out[3]:

	0	1	2	3	4	5	6	7	8	9	10	12	13	14
0	39	State-gov	77516	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	40	United-States	<=50K
1	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	13	United-States	<=50K
2	38	Private	215646	HS-grad	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	0	40	United-States	<=50K
3	53	Private	234721	11th	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0	40	United-States	<=50K
4	28	Private	338409	Bachelors	13	Married-civ-spouse	Prof-specialty	Wife	Black	Female	0	40	Cuba	<=50K

In [4]:

adults.columns = ["Age","Work_class","fnlwgt","Education","Education_num","Maritial_status","Occupation",
                 "Relationship","Race","Sex","Capital_gain","Capital_loss","hpw","native_country","Class"]

In [5]:

adults["Class"].unique()

Out[5]:

array([' <=50K', ' >50K'], dtype=object)

In [6]:

adults["Class"] = adults["Class"].map({' <=50K':0,' >50K':1})

In [7]:

for i in adults.columns:
    print("{} -- has : {} unique values which are :\n {}".format((i), adults[i].nunique(), adults[i].unique()),"\n")

Age -- has : 73 unique values which are :
 [39 50 38 53 28 37 49 52 31 42 30 23 32 40 34 25 43 54 35 59 56 19 20 45
 22 48 21 24 57 44 41 29 18 47 46 36 79 27 67 33 76 17 55 61 70 64 71 68
 66 51 58 26 60 90 75 65 77 62 63 80 72 74 69 73 81 78 88 82 83 84 85 86
 87] 

Work_class -- has : 9 unique values which are :
 [' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked'] 

fnlwgt -- has : 21648 unique values which are :
 [ 77516  83311 215646 ...  34066  84661 257302] 

Education -- has : 16 unique values which are :
 [' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th'] 

Education_num -- has : 16 unique values which are :
 [13  9  7 14  5 10 12 11  4 16 15  3  6  2  1  8] 

Maritial_status -- has : 7 unique values which are :
 [' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed'] 

Occupation -- has : 15 unique values which are :
 [' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Craft-repair' ' Transport-moving'
 ' Farming-fishing' ' Machine-op-inspct' ' Tech-support' ' ?'
 ' Protective-serv' ' Armed-Forces' ' Priv-house-serv'] 

Relationship -- has : 6 unique values which are :
 [' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative'] 

Race -- has : 5 unique values which are :
 [' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other'] 

Sex -- has : 2 unique values which are :
 [' Male' ' Female'] 

Capital_gain -- has : 119 unique values which are :
 [ 2174     0 14084  5178  5013  2407 14344 15024  7688 34095  4064  4386
  7298  1409  3674  1055  3464  2050  2176   594 20051  6849  4101  1111
  8614  3411  2597 25236  4650  9386  2463  3103 10605  2964  3325  2580
  3471  4865 99999  6514  1471  2329  2105  2885 25124 10520  2202  2961
 27828  6767  2228  1506 13550  2635  5556  4787  3781  3137  3818  3942
   914   401  2829  2977  4934  2062  2354  5455 15020  1424  3273 22040
  4416  3908 10566   991  4931  1086  7430  6497   114  7896  2346  3418
  3432  2907  1151  2414  2290 15831 41310  4508  2538  3456  6418  1848
  3887  5721  9562  1455  2036  1831 11678  2936  2993  7443  6360  1797
  1173  4687  6723  2009  6097  2653  1639 18481  7978  2387  5060] 

Capital_loss -- has : 92 unique values which are :
 [   0 2042 1408 1902 1573 1887 1719 1762 1564 2179 1816 1980 1977 1876
 1340 2206 1741 1485 2339 2415 1380 1721 2051 2377 1669 2352 1672  653
 2392 1504 2001 1590 1651 1628 1848 1740 2002 1579 2258 1602  419 2547
 2174 2205 1726 2444 1138 2238  625  213 1539  880 1668 1092 1594 3004
 2231 1844  810 2824 2559 2057 1974  974 2149 1825 1735 1258 2129 2603
 2282  323 4356 2246 1617 1648 2489 3770 1755 3683 2267 2080 2457  155
 3900 2201 1944 2467 2163 2754 2472 1411] 

hpw -- has : 94 unique values which are :
 [40 13 16 45 50 80 30 35 60 20 52 44 15 25 38 43 55 48 58 32 70  2 22 56
 41 28 36 24 46 42 12 65  1 10 34 75 98 33 54  8  6 64 19 18 72  5  9 47
 37 21 26 14  4 59  7 99 53 39 62 57 78 90 66 11 49 84  3 17 68 27 85 31
 51 77 63 23 87 88 73 89 97 94 29 96 67 82 86 91 81 76 92 61 74 95] 

native_country -- has : 42 unique values which are :
 [' United-States' ' Cuba' ' Jamaica' ' India' ' ?' ' Mexico' ' South'
 ' Puerto-Rico' ' Honduras' ' England' ' Canada' ' Germany' ' Iran'
 ' Philippines' ' Italy' ' Poland' ' Columbia' ' Cambodia' ' Thailand'
 ' Ecuador' ' Laos' ' Taiwan' ' Haiti' ' Portugal' ' Dominican-Republic'
 ' El-Salvador' ' France' ' Guatemala' ' China' ' Japan' ' Yugoslavia'
 ' Peru' ' Outlying-US(Guam-USVI-etc)' ' Scotland' ' Trinadad&Tobago'
 ' Greece' ' Nicaragua' ' Vietnam' ' Hong' ' Ireland' ' Hungary'
 ' Holand-Netherlands'] 

Class -- has : 2 unique values which are :
 [0 1]

In [8]:

categorical_columns = adults.columns[adults.dtypes!=np.int64]
categorical_columns

Out[8]:

Index(['Work_class', 'Education', 'Maritial_status', 'Occupation',
       'Relationship', 'Race', 'Sex', 'native_country'],
      dtype='object')

In [9]:

# for i in categorical_columns:
#     adults[i] = adults[i].str.replace("\?","Nan")

In [9]:

adults.loc[:,categorical_columns].head(1)

Out[9]:

	Work_class	Education	Maritial_status	Occupation	Relationship	Race	Sex	native_country
0	State-gov	Bachelors	Never-married	Adm-clerical	Not-in-family	White	Male	United-States

In [10]:

maritial = adults.groupby("Maritial_status")["Class"].value_counts().unstack()

In [11]:

maritial.head()

Out[11]:

Class	0	1
Maritial_status
Divorced	3980	463
Married-AF-spouse	13	10
Married-civ-spouse	8284	6692
Married-spouse-absent	384	34
Never-married	10192	491

In [12]:

pd.crosstab(index =adults[ "Maritial_status"],columns = adults["Class"],
            normalize = "columns",
            margins = False,
            values=adults.Sex,
            aggfunc = "count").sort_values(1).plot(kind = "bar")
plt.title("Income vs Maritial Status")
plt.xlabel("Status")
plt.ylabel("% of Class")

Out[12]:

Text(0,0.5,'% of Class')

In [13]:

maritial.columns = ["<=50",">50"]
maritial.plot(kind = "barh",label =["<=50",">50"],color = ["firebrick","gold"],figsize = (10,4))
plt.legend()
plt.ylabel("Maritial Statues")
plt.xlabel("Number of Adults")
plt.title("Income vs Maritial Status")

Out[13]:

Text(0.5,1,'Income vs Maritial Status')

In [14]:

for i in categorical_columns:
    adults[i] = adults[i].str.replace(" ","")

In [15]:

adults.groupby(["Sex","Maritial_status"])["Class"].value_counts().unstack().index

Out[15]:

MultiIndex(levels=[['Female', 'Male'], ['Divorced', 'Married-AF-spouse', 'Married-civ-spouse', 'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], [0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6]],
           names=['Sex', 'Maritial_status'])

In [16]:

female_mar = adults.groupby(["Sex","Maritial_status"])["Class"].value_counts().unstack().loc["Female"]
male_mar = adults.groupby(["Sex","Maritial_status"])["Class"].value_counts().unstack().loc["Male"]

In [17]:

MS = adults.groupby(["Maritial_status"])["Class"].value_counts().unstack()
MSP = MS.div(MS.sum(1),0)
MSP.columns = ["<=$50K",">$50K"]

In [18]:

p = MSP.plot(kind = "barh", color = ["slategray","peachpuff"])
t = p.get_yticklabels()
plt.xticks(np.arange(0,1.1,0.1));
plt.ylabel("Maritial Status")
plt.xlabel("% of Maritial sector")
plt.title("High Income vs Maritial Status")
plt.legend(loc = "upper right")
plt.legend(framealpha = 0.4)

Out[18]:

<matplotlib.legend.Legend at 0x1d37037e908>

In [19]:

for i in t:
    print(i)

Text(0,0,'Divorced')
Text(0,1,'Married-AF-spouse')
Text(0,2,'Married-civ-spouse')
Text(0,3,'Married-spouse-absent')
Text(0,4,'Never-married')
Text(0,5,'Separated')
Text(0,6,'Widowed')

In [20]:

Labels = []
for i in list(female_mar.index):
    Labels.append(i)
    
Labels

Out[20]:

['Divorced',
 'Married-AF-spouse',
 'Married-civ-spouse',
 'Married-spouse-absent',
 'Never-married',
 'Separated',
 'Widowed']

In [21]:

labels = {}
for i,n in enumerate(female_mar.index):
    labels[i+1] = n
labels

Out[21]:

{1: 'Divorced',
 2: 'Married-AF-spouse',
 3: 'Married-civ-spouse',
 4: 'Married-spouse-absent',
 5: 'Never-married',
 6: 'Separated',
 7: 'Widowed'}

In [22]:

labels[4]="Absent Spouse"

In [23]:

fig, (ax_left, ax_right) = plt.subplots(ncols=2,figsize = (20,6),sharex = False,sharey =False)
# fig.subplots_adjust(hspace =1)
# we dont need to pass on the actual y values to tick just yet , we can add range of integers and change the label later

bottm = list(labels.keys())
              
ax_left.barh(bottom = bottm,
             width =female_mar[0],
             height = 0.2,
             align='center',
             facecolor='brown',
             label = "=<50")

ax_left.barh(bottom = [i+0.2 for i in bottm],
             width =female_mar[1],
             height = 0.2,
             align='center',
             facecolor='gold',
             label = ">50")



ax_left.set_yticks([])
# ax_left.set_ylabel(Labels)
ax_left.invert_xaxis()


ax_right.barh(bottom = bottm,
             width = male_mar[0],
             height = 0.2,
             align = "center",
             facecolor = "brown",
             label = "=<50")

ax_right.barh(bottom = [i+0.2 for i in bottm],
             width = male_mar[1],
             height = 0.2,
             align = "center",
             facecolor = "gold",
             label = ">50")
ax_right.legend()
ax_left.legend()

ax_right.set_xticks(range(0,8000,500))
ax_left.set_xticks(range(0,8000,500))
ax_right.set_yticks(range(1,8))
# x moves tick labels relative to left edge of axes in axes units
ax_right.set_yticklabels(labels.values(),ha='center', x=-0.075,y = -0.2,rotation = 0, size =12);
# ax_right.set_xlabel('Male')
ax_right.set_xlabel("Male",size = 15)
ax_left.set_xlabel("Female",size = 15)
fig.suptitle("Income vs Sex & Maritial Status",size =15)

C:\Users\Mostafa\Anaconda3\lib\site-packages\ipykernel_launcher.py:12: MatplotlibDeprecationWarning: The *bottom* kwarg to `barh` is deprecated use *y* instead. Support for *bottom* will be removed in Matplotlib 3.0
  if sys.path[0] == '':
C:\Users\Mostafa\Anaconda3\lib\site-packages\ipykernel_launcher.py:19: MatplotlibDeprecationWarning: The *bottom* kwarg to `barh` is deprecated use *y* instead. Support for *bottom* will be removed in Matplotlib 3.0
C:\Users\Mostafa\Anaconda3\lib\site-packages\ipykernel_launcher.py:33: MatplotlibDeprecationWarning: The *bottom* kwarg to `barh` is deprecated use *y* instead. Support for *bottom* will be removed in Matplotlib 3.0
C:\Users\Mostafa\Anaconda3\lib\site-packages\ipykernel_launcher.py:40: MatplotlibDeprecationWarning: The *bottom* kwarg to `barh` is deprecated use *y* instead. Support for *bottom* will be removed in Matplotlib 3.0

Out[23]:

Text(0.5,0.98,'Income vs Sex & Maritial Status')

In [25]:

gen = adults.groupby("Class")["Sex"].value_counts().unstack()
gen = gen.div(gen.sum(1),0)
gen.index =["<=$50K",">$50K"]
gen.plot(kind= "bar", rot = 0, color =["salmon","dimgrey"],figsize = (12,6))
plt.title("Gender vs High Income", size = 15)
plt.xlabel("Income", size = 15)
plt.ylabel("% of Income section", size = 15)
plt.legend(framealpha = 0.4)

Out[25]:

<matplotlib.legend.Legend at 0x271be2f9c18>

In [26]:

#Visualising the correlation between the numercial values


names = list(adults.corr().columns)
correlations = adults.corr()
# plot correlation matrix
fig = plt.figure(figsize = (10,10))
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
ticks = np.arange(0,7,1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names,rotation = 90)
ax.set_yticklabels(names)
plt.show()

In [27]:

ed_sex = adults.groupby("Education").Sex.value_counts().unstack()
ed_sex = ed_sex.div(ed_sex.sum(1),0)
ed_sex.plot(kind = "bar", color = ["Salmon","grey"])

Out[27]:

<matplotlib.axes._subplots.AxesSubplot at 0x271be69fda0>

In [28]:

ed = adults.groupby("Education")["Class"].value_counts().unstack().fillna(0)
ed = ed.sort_values(1)
ed.div(ed.sum(0),1).plot(kind = "bar", color = ["Salmon","grey"])

Out[28]:

<matplotlib.axes._subplots.AxesSubplot at 0x271bfb0a048>

In [29]:

rel = adults.groupby("Relationship")["Class"].value_counts().unstack()
rel.div(rel.sum(1),0).plot(kind = "bar", color = ["salmon","grey"])

Out[29]:

<matplotlib.axes._subplots.AxesSubplot at 0x271bfc8c208>

In [30]:

rel.plot(kind = "bar", color = ["salmon","grey"])

Out[30]:

<matplotlib.axes._subplots.AxesSubplot at 0x271bfb1dd68>

In [31]:

list(range(adults.Race.nunique()+1))

Out[31]:

[0, 1, 2, 3, 4, 5]

In [32]:

rac = adults.groupby("Race")["Class"].value_counts().unstack().sort_values(1)
rac.div(rac.sum(1),0).plot(kind = "barh", color= ["salmon","grey"])
plt.title("Income of Race groups")
plt.xlabel("% of Race ")
plt.ylabel("Race")

Out[32]:

<matplotlib.text.Text at 0x271bfffed68>

In [33]:

adults.groupby("native_country")["Class"].value_counts().unstack().sort_values(1).head()

Out[33]:

Class	0	1
native_country
Honduras	12.0	1.0
Nicaragua	32.0	2.0
Peru	29.0	2.0
Trinadad&Tobago	17.0	2.0
Columbia	57.0	2.0

In [24]:

numerical_columns = adults.columns[adults.dtypes == np.int64]
numerical_columns = numerical_columns.drop("Class")
numerical_columns

Out[24]:

Index(['Age', 'fnlwgt', 'Education_num', 'Capital_gain', 'Capital_loss',
       'hpw'],
      dtype='object')

In [25]:

for col in categorical_columns:
    print("{} has {} unique values:\n which are :\n {}".format(col, adults[col].nunique(), set(adults[col].unique())))

Work_class has 9 unique values:
 which are :
 {'State-gov', 'Without-pay', '?', 'Private', 'Local-gov', 'Self-emp-not-inc', 'Federal-gov', 'Never-worked', 'Self-emp-inc'}
Education has 16 unique values:
 which are :
 {'Doctorate', 'Prof-school', '11th', 'Assoc-acdm', '1st-4th', '5th-6th', 'Preschool', 'Bachelors', '12th', 'HS-grad', '7th-8th', 'Assoc-voc', 'Masters', 'Some-college', '10th', '9th'}
Maritial_status has 7 unique values:
 which are :
 {'Married-civ-spouse', 'Widowed', 'Divorced', 'Separated', 'Married-AF-spouse', 'Never-married', 'Married-spouse-absent'}
Occupation has 15 unique values:
 which are :
 {'Tech-support', 'Craft-repair', 'Sales', 'Transport-moving', 'Handlers-cleaners', '?', 'Farming-fishing', 'Exec-managerial', 'Priv-house-serv', 'Armed-Forces', 'Protective-serv', 'Prof-specialty', 'Adm-clerical', 'Machine-op-inspct', 'Other-service'}
Relationship has 6 unique values:
 which are :
 {'Husband', 'Own-child', 'Unmarried', 'Not-in-family', 'Other-relative', 'Wife'}
Race has 5 unique values:
 which are :
 {'Other', 'Amer-Indian-Eskimo', 'White', 'Asian-Pac-Islander', 'Black'}
Sex has 2 unique values:
 which are :
 {'Male', 'Female'}
native_country has 42 unique values:
 which are :
 {'Mexico', '?', 'Canada', 'Puerto-Rico', 'Trinadad&Tobago', 'Nicaragua', 'Jamaica', 'Cuba', 'China', 'Honduras', 'Holand-Netherlands', 'Philippines', 'Thailand', 'Poland', 'Japan', 'Ireland', 'United-States', 'Yugoslavia', 'Hungary', 'Scotland', 'Vietnam', 'Greece', 'South', 'Ecuador', 'India', 'Taiwan', 'Cambodia', 'Laos', 'Germany', 'Haiti', 'Dominican-Republic', 'France', 'Guatemala', 'Italy', 'Hong', 'Columbia', 'Outlying-US(Guam-USVI-etc)', 'England', 'Peru', 'Portugal', 'Iran', 'El-Salvador'}

In [26]:

dummy = pd.get_dummies(adults[categorical_columns],drop_first=True)
dummy.head()

Out[26]:

	Work_class_Private	Work_class_Self-emp-not-inc	Work_class_State-gov	Education_11th	...	native_country_United-States
0	0	0	1	0	...	1
1	0	1	0	0	...	1
2	1	0	0	0	...	1
3	1	0	0	1	...	1
4	1	0	0	0	...	0

5 rows × 94 columns

In [27]:

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
adults[numerical_columns] = sc.fit_transform(adults[numerical_columns])

In [28]:

adults[numerical_columns].head()

Out[28]:

	Age	fnlwgt	Education_num	Capital_gain	Capital_loss	hpw
0	0.030671	-1.063611	1.134739	0.148453	-0.21666	-0.035429
1	0.837109	-1.008707	1.134739	-0.145920	-0.21666	-2.222153
2	-0.042642	0.245079	-0.420060	-0.145920	-0.21666	-0.035429
3	1.057047	0.425801	-1.197459	-0.145920	-0.21666	-0.035429
4	-0.775768	1.408176	1.134739	-0.145920	-0.21666	-0.035429

In [29]:

data = pd.concat([dummy,adults[numerical_columns],adults["Class"]], axis = 1)

In [30]:

X = data.iloc[: , :-1].values
y = data.iloc[:,-1].values

In [31]:

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

c1 = KNeighborsClassifier(n_neighbors=1)
c2 = RandomForestClassifier(random_state=1)
c3 = GaussianNB()
c4 = SVC()
lr = LogisticRegression()


for c , label in zip([c1,c2,c3,c4],["KNN","RandomForest","Naive Bayes","Support Vector Machine"]):
    scores = cross_val_score(c, X, y , cv = 3, scoring = "accuracy")
    print("accuracy {} +/- {} [{}]".format(scores.mean() , scores.std(), label))

accuracy 0.8014189494822662 +/- 0.0025677997025294486 [KNN]
accuracy 0.8462884685200729 +/- 0.0010725973236293852 [RandomForest]
accuracy 0.48693162620974345 +/- 0.03332009017352423 [Naive Bayes]
accuracy 0.8526766107806513 +/- 0.0038549657632717853 [Support Vector Machine]

In [32]:

from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X , y ,test_size = 0.15 , random_state = 42)

In [34]:

from xgboost import XGBClassifier
classifier = XGBClassifier(n_estimators=100 , max_depth=2)
classifier.fit(X_train , y_train)
y_pred = classifier.predict(X_test)

from sklearn.metrics import accuracy_score
score = accuracy_score(y_test , y_pred)
score

C:\Users\Mostafa\Anaconda3\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:

Out[34]:

0.8607983623336745

In [35]:

x_score = cross_val_score(classifier , X, y , scoring="accuracy", cv = 10)
print(x_score.mean(),"+/-",x_score.std() )

C:\Users\Mostafa\Anaconda3\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
C:\Users\Mostafa\Anaconda3\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
C:\Users\Mostafa\Anaconda3\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
C:\Users\Mostafa\Anaconda3\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
C:\Users\Mostafa\Anaconda3\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
C:\Users\Mostafa\Anaconda3\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
C:\Users\Mostafa\Anaconda3\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
C:\Users\Mostafa\Anaconda3\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:
C:\Users\Mostafa\Anaconda3\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:

0.8582048756826159 +/- 0.004796749054344141

C:\Users\Mostafa\Anaconda3\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
  if diff:

In [45]:

from tpot import TPOTClassifier
tpot_classifier = TPOTClassifier(generations=5,population_size=60,verbosity=2,max_time_mins=2)
tpot_classifier.fit(X,y)

TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: GradientBoostingClassifier(RFE(input_matrix, RFE__ExtraTreesClassifier__criterion=entropy, RFE__ExtraTreesClassifier__max_features=0.35, RFE__ExtraTreesClassifier__n_estimators=100, RFE__step=0.6), GradientBoostingClassifier__learning_rate=0.1, GradientBoostingClassifier__max_depth=8, GradientBoostingClassifier__max_features=0.9, GradientBoostingClassifier__min_samples_leaf=1, GradientBoostingClassifier__min_samples_split=18, GradientBoostingClassifier__n_estimators=100, GradientBoostingClassifier__subsample=0.8)

Out[45]:

TPOTClassifier(config_dict={'sklearn.naive_bayes.GaussianNB': {}, 'sklearn.naive_bayes.BernoulliNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.naive_bayes.MultinomialNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.tree.DecisionT....45,
        0.5 ,  0.55,  0.6 ,  0.65,  0.7 ,  0.75,  0.8 ,  0.85,  0.9 ,
        0.95,  1.  ])}}}},
        crossover_rate=0.1, cv=5, disable_update_check=False,
        generations=1000000, max_eval_time_mins=5, max_time_mins=2,
        mutation_rate=0.9, n_jobs=1, offspring_size=60, population_size=60,
        random_state=None, scoring=None, subsample=1.0, verbosity=2,
        warm_start=False)

In [47]:

tpot_classifier.score(X_test,y_test)

Out[47]:

0.91136131013306043

In [48]:

tpot_classifier.export("Adults_Ml.py")

In [50]:

# %load Adults_ML.py
import numpy as np

from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# NOTE: Make sure that the class is labeled 'class' in the data file
# tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
# features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(X, y, random_state=42)

exported_pipeline = make_pipeline(
    RFE(estimator=ExtraTreesClassifier(criterion="entropy", max_features=0.35000000000000003, n_estimators=100), step=0.6000000000000001),
    GradientBoostingClassifier(learning_rate=0.1, max_depth=8, max_features=0.9000000000000001, min_samples_leaf=1, min_samples_split=18, n_estimators=100, subsample=0.8)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
results

Out[50]:

array([0, 0, 1, ..., 0, 1, 0], dtype=int64)

In [53]:

accuracy_score(testing_target, results)

Out[53]:

0.87483110183024193

In [54]:

cross_val_score(exported_pipeline, X, y, cv = 10 , scoring = "accuracy").mean()

Out[54]:

0.87153359537839115

	Work_class_Private	Work_class_Self-emp-not-inc	Work_class_State-gov	Education_11th	...	native_country_United-States
0	0	0	1	0	...	1
1	0	1	0	0	...	1
2	1	0	0	0	...	1
3	1	0	0	1	...	1
4	1	0	0	0	...	0

	Work_class_Private	Work_class_Self-emp-not-inc	Work_class_State-gov	Education_11th	...	native_country_United-States
0	0	0	1	0	...	1
1	0	1	0	0	...	1
2	1	0	0	0	...	1
3	1	0	0	1	...	1
4	1	0	0	0	...	0

Adults Income Analysis

	Work_class_Private	Work_class_Self-emp-not-inc	Work_class_State-gov	Education_11th	...	native_country_United-States
0	0	0	1	0	...	1
1	0	1	0	0	...	1
2	1	0	0	0	...	1
3	1	0	0	1	...	1
4	1	0	0	0	...	0