Adult Income Analysis

An exploratory analysis of the UCI Adult (census income) dataset: predicting whether a person's income exceeds $50K a year.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
adults = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", header=None)
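A side note on loading: pandas can do the cleanup at read time, which would make the space-stripping and '?' handling later in this notebook unnecessary. A sketch under that assumption (adults_clean is a new, illustrative name; the analysis below keeps the original raw load):
In [ ]:
# Alternative load (a sketch, not used below): skipinitialspace drops the space
# that follows each comma, and na_values maps the '?' placeholder to NaN.
adults_clean = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
    header=None, skipinitialspace=True, na_values="?")
adults_clean.head(1)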
In [3]:
adults.head()
Out[3]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
In [4]:
adults.columns = ["Age","Work_class","fnlwgt","Education","Education_num","Maritial_status","Occupation",
                 "Relationship","Race","Sex","Capital_gain","Capital_loss","hpw","native_country","Class"]
In [5]:
adults["Class"].unique()
Out[5]:
array([' <=50K', ' >50K'], dtype=object)
In [6]:
adults["Class"] = adults["Class"].map({' <=50K':0,' >50K':1})
In [7]:
for i in adults.columns:
    print("{} -- has : {} unique values which are :\n {}".format(i, adults[i].nunique(), adults[i].unique()), "\n")
Age -- has : 73 unique values which are :
 [39 50 38 53 28 37 49 52 31 42 30 23 32 40 34 25 43 54 35 59 56 19 20 45
 22 48 21 24 57 44 41 29 18 47 46 36 79 27 67 33 76 17 55 61 70 64 71 68
 66 51 58 26 60 90 75 65 77 62 63 80 72 74 69 73 81 78 88 82 83 84 85 86
 87] 

Work_class -- has : 9 unique values which are :
 [' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked'] 

fnlwgt -- has : 21648 unique values which are :
 [ 77516  83311 215646 ...  34066  84661 257302] 

Education -- has : 16 unique values which are :
 [' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th'] 

Education_num -- has : 16 unique values which are :
 [13  9  7 14  5 10 12 11  4 16 15  3  6  2  1  8] 

Maritial_status -- has : 7 unique values which are :
 [' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed'] 

Occupation -- has : 15 unique values which are :
 [' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Craft-repair' ' Transport-moving'
 ' Farming-fishing' ' Machine-op-inspct' ' Tech-support' ' ?'
 ' Protective-serv' ' Armed-Forces' ' Priv-house-serv'] 

Relationship -- has : 6 unique values which are :
 [' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative'] 

Race -- has : 5 unique values which are :
 [' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other'] 

Sex -- has : 2 unique values which are :
 [' Male' ' Female'] 

Capital_gain -- has : 119 unique values which are :
 [ 2174     0 14084  5178  5013  2407 14344 15024  7688 34095  4064  4386
  7298  1409  3674  1055  3464  2050  2176   594 20051  6849  4101  1111
  8614  3411  2597 25236  4650  9386  2463  3103 10605  2964  3325  2580
  3471  4865 99999  6514  1471  2329  2105  2885 25124 10520  2202  2961
 27828  6767  2228  1506 13550  2635  5556  4787  3781  3137  3818  3942
   914   401  2829  2977  4934  2062  2354  5455 15020  1424  3273 22040
  4416  3908 10566   991  4931  1086  7430  6497   114  7896  2346  3418
  3432  2907  1151  2414  2290 15831 41310  4508  2538  3456  6418  1848
  3887  5721  9562  1455  2036  1831 11678  2936  2993  7443  6360  1797
  1173  4687  6723  2009  6097  2653  1639 18481  7978  2387  5060] 

Capital_loss -- has : 92 unique values which are :
 [   0 2042 1408 1902 1573 1887 1719 1762 1564 2179 1816 1980 1977 1876
 1340 2206 1741 1485 2339 2415 1380 1721 2051 2377 1669 2352 1672  653
 2392 1504 2001 1590 1651 1628 1848 1740 2002 1579 2258 1602  419 2547
 2174 2205 1726 2444 1138 2238  625  213 1539  880 1668 1092 1594 3004
 2231 1844  810 2824 2559 2057 1974  974 2149 1825 1735 1258 2129 2603
 2282  323 4356 2246 1617 1648 2489 3770 1755 3683 2267 2080 2457  155
 3900 2201 1944 2467 2163 2754 2472 1411] 

hpw -- has : 94 unique values which are :
 [40 13 16 45 50 80 30 35 60 20 52 44 15 25 38 43 55 48 58 32 70  2 22 56
 41 28 36 24 46 42 12 65  1 10 34 75 98 33 54  8  6 64 19 18 72  5  9 47
 37 21 26 14  4 59  7 99 53 39 62 57 78 90 66 11 49 84  3 17 68 27 85 31
 51 77 63 23 87 88 73 89 97 94 29 96 67 82 86 91 81 76 92 61 74 95] 

native_country -- has : 42 unique values which are :
 [' United-States' ' Cuba' ' Jamaica' ' India' ' ?' ' Mexico' ' South'
 ' Puerto-Rico' ' Honduras' ' England' ' Canada' ' Germany' ' Iran'
 ' Philippines' ' Italy' ' Poland' ' Columbia' ' Cambodia' ' Thailand'
 ' Ecuador' ' Laos' ' Taiwan' ' Haiti' ' Portugal' ' Dominican-Republic'
 ' El-Salvador' ' France' ' Guatemala' ' China' ' Japan' ' Yugoslavia'
 ' Peru' ' Outlying-US(Guam-USVI-etc)' ' Scotland' ' Trinadad&Tobago'
 ' Greece' ' Nicaragua' ' Vietnam' ' Hong' ' Ireland' ' Hungary'
 ' Holand-Netherlands'] 

Class -- has : 2 unique values which are :
 [0 1] 

In [8]:
categorical_columns = adults.columns[adults.dtypes!=np.int64]
categorical_columns
Out[8]:
Index(['Work_class', 'Education', 'Maritial_status', 'Occupation',
       'Relationship', 'Race', 'Sex', 'native_country'],
      dtype='object')
In [9]:
# The '?' placeholders could be replaced here, but note that str.replace with
# the string "Nan" would store a literal string, not a real missing value:
# for i in categorical_columns:
#     adults[i] = adults[i].str.replace("\?", "Nan")
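If real missing values were wanted instead, one side-effect-free sketch (adults_nan is an illustrative copy; note the values still carry a leading space at this point):
In [ ]:
# Sketch: map the ' ?' placeholder to real NaNs on a copy and count the
# affected rows per column. The main analysis keeps the '?' rows as-is.
adults_nan = adults.copy()
for col in categorical_columns:
    adults_nan[col] = adults_nan[col].replace(" ?", np.nan)
adults_nan[categorical_columns].isna().sum()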
In [9]:
adults.loc[:,categorical_columns].head(1)
Out[9]:
Work_class Education Maritial_status Occupation Relationship Race Sex native_country
0 State-gov Bachelors Never-married Adm-clerical Not-in-family White Male United-States
In [10]:
maritial = adults.groupby("Maritial_status")["Class"].value_counts().unstack()
In [11]:
maritial.head()
Out[11]:
Class 0 1
Maritial_status
Divorced 3980 463
Married-AF-spouse 13 10
Married-civ-spouse 8284 6692
Married-spouse-absent 384 34
Never-married 10192 491
In [12]:
pd.crosstab(index=adults["Maritial_status"], columns=adults["Class"],
            normalize="columns",
            margins=False).sort_values(1).plot(kind="bar")
plt.title("Income vs Marital Status")
plt.xlabel("Status")
plt.ylabel("% of Class")
Out[12]:
Text(0,0.5,'% of Class')
In [13]:
maritial.columns = ["<=50K", ">50K"]
maritial.plot(kind="barh", color=["firebrick", "gold"], figsize=(10, 4))
plt.legend()
plt.ylabel("Marital Status")
plt.xlabel("Number of Adults")
plt.title("Income vs Marital Status")
Out[13]:
Text(0.5,1,'Income vs Marital Status')
In [14]:
for i in categorical_columns:
    adults[i] = adults[i].str.strip()   # every raw categorical value carries a leading space
In [15]:
adults.groupby(["Sex","Maritial_status"])["Class"].value_counts().unstack().index
Out[15]:
MultiIndex(levels=[['Female', 'Male'], ['Divorced', 'Married-AF-spouse', 'Married-civ-spouse', 'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed']],
           labels=[[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], [0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6]],
           names=['Sex', 'Maritial_status'])
In [16]:
female_mar = adults.groupby(["Sex","Maritial_status"])["Class"].value_counts().unstack().loc["Female"]
male_mar = adults.groupby(["Sex","Maritial_status"])["Class"].value_counts().unstack().loc["Male"]
In [17]:
MS = adults.groupby(["Maritial_status"])["Class"].value_counts().unstack()
MSP = MS.div(MS.sum(1), 0)   # row-normalise: class share within each marital status
MSP.columns = ["<=$50K", ">$50K"]
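The same row-normalised table can come straight out of crosstab, for comparison with the manual div above:
In [ ]:
# Equivalent one-liner: normalize="index" makes each marital-status row sum to 1.
pd.crosstab(adults["Maritial_status"], adults["Class"], normalize="index")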
In [18]:
p = MSP.plot(kind="barh", color=["slategray", "peachpuff"])
t = p.get_yticklabels()
plt.xticks(np.arange(0, 1.1, 0.1));
plt.ylabel("Marital Status")
plt.xlabel("% of marital-status group")
plt.title("High Income vs Marital Status")
plt.legend(loc="upper right", framealpha=0.4)   # one call; a second plt.legend() would discard loc
Out[18]:
<matplotlib.legend.Legend at 0x1d37037e908>
In [19]:
for i in t:
    print(i)
Text(0,0,'Divorced')
Text(0,1,'Married-AF-spouse')
Text(0,2,'Married-civ-spouse')
Text(0,3,'Married-spouse-absent')
Text(0,4,'Never-married')
Text(0,5,'Separated')
Text(0,6,'Widowed')
In [20]:
Labels = list(female_mar.index)
Labels
Out[20]:
['Divorced',
 'Married-AF-spouse',
 'Married-civ-spouse',
 'Married-spouse-absent',
 'Never-married',
 'Separated',
 'Widowed']
In [21]:
labels = {i + 1: n for i, n in enumerate(female_mar.index)}
labels
Out[21]:
{1: 'Divorced',
 2: 'Married-AF-spouse',
 3: 'Married-civ-spouse',
 4: 'Married-spouse-absent',
 5: 'Never-married',
 6: 'Separated',
 7: 'Widowed'}
In [22]:
labels[4] = "Absent Spouse"   # shorten the long 'Married-spouse-absent' tick label
In [23]:
fig, (ax_left, ax_right) = plt.subplots(ncols=2, figsize=(20, 6), sharex=False, sharey=False)
# We don't need the real category names for the y positions yet: plot against
# integers and swap in the readable labels afterwards.

bottm = list(labels.keys())

ax_left.barh(y=bottm,
             width=female_mar[0],
             height=0.2,
             align='center',
             facecolor='brown',
             label="<=50")

ax_left.barh(y=[i + 0.2 for i in bottm],
             width=female_mar[1],
             height=0.2,
             align='center',
             facecolor='gold',
             label=">50")

ax_left.set_yticks([])
ax_left.invert_xaxis()

ax_right.barh(y=bottm,
              width=male_mar[0],
              height=0.2,
              align="center",
              facecolor="brown",
              label="<=50")

ax_right.barh(y=[i + 0.2 for i in bottm],
              width=male_mar[1],
              height=0.2,
              align="center",
              facecolor="gold",
              label=">50")
ax_right.legend()
ax_left.legend()

ax_right.set_xticks(range(0, 8000, 500))
ax_left.set_xticks(range(0, 8000, 500))
ax_right.set_yticks(range(1, 8))
# x/y shift the tick labels relative to the left edge of the axes, in axes units
ax_right.set_yticklabels(labels.values(), ha='center', x=-0.075, y=-0.2, rotation=0, size=12);
ax_right.set_xlabel("Male", size=15)
ax_left.set_xlabel("Female", size=15)
fig.suptitle("Income vs Sex & Marital Status", size=15)
Out[23]:
Text(0.5,0.98,'Income vs Sex & Marital Status')
In [25]:
gen = adults.groupby("Class")["Sex"].value_counts().unstack()
gen = gen.div(gen.sum(1),0)
gen.index =["<=$50K",">$50K"]
gen.plot(kind= "bar", rot = 0, color =["salmon","dimgrey"],figsize = (12,6))
plt.title("Gender vs High Income", size = 15)
plt.xlabel("Income", size = 15)
plt.ylabel("% of Income section", size = 15)
plt.legend(framealpha = 0.4)
Out[25]:
<matplotlib.legend.Legend at 0x271be2f9c18>
In [26]:
# Visualising the correlation between the numerical features

correlations = adults.corr()
names = list(correlations.columns)
# plot correlation matrix
fig = plt.figure(figsize = (10,10))
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
ticks = np.arange(0,7,1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(names,rotation = 90)
ax.set_yticklabels(names)
plt.show()
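An annotated heatmap is often easier to read than matshow plus a colorbar; a sketch assuming seaborn is available in the environment:
In [ ]:
# Alternative rendering of the same matrix (assumes seaborn is installed).
import seaborn as sns
plt.figure(figsize=(8, 8))
sns.heatmap(correlations, vmin=-1, vmax=1, annot=True, fmt=".2f")
plt.show()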
In [27]:
ed_sex = adults.groupby("Education").Sex.value_counts().unstack()
ed_sex = ed_sex.div(ed_sex.sum(1),0)
ed_sex.plot(kind = "bar", color = ["Salmon","grey"])
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x271be69fda0>
In [28]:
ed = adults.groupby("Education")["Class"].value_counts().unstack().fillna(0)
ed = ed.sort_values(1)
# normalise down each column: each income class's distribution over education levels
ed.div(ed.sum(0), 1).plot(kind="bar", color=["salmon", "grey"])
Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x271bfb0a048>
In [29]:
rel = adults.groupby("Relationship")["Class"].value_counts().unstack()
rel.div(rel.sum(1),0).plot(kind = "bar", color = ["salmon","grey"])
Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x271bfc8c208>
In [30]:
rel.plot(kind = "bar", color = ["salmon","grey"])
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x271bfb1dd68>
In [31]:
list(range(adults.Race.nunique()+1))
Out[31]:
[0, 1, 2, 3, 4, 5]
In [32]:
rac = adults.groupby("Race")["Class"].value_counts().unstack().sort_values(1)
rac.div(rac.sum(1), 0).plot(kind="barh", color=["salmon", "grey"])
plt.title("Income by Race Group")
plt.xlabel("% of race group")
plt.ylabel("Race")
Out[32]:
<matplotlib.text.Text at 0x271bfffed68>
In [33]:
adults.groupby("native_country")["Class"].value_counts().unstack().sort_values(1).head()
Out[33]:
Class 0 1
native_country
Honduras 12.0 1.0
Nicaragua 32.0 2.0
Peru 29.0 2.0
Trinadad&Tobago 17.0 2.0
Columbia 57.0 2.0
In [24]:
numerical_columns = adults.columns[adults.dtypes == np.int64]
numerical_columns = numerical_columns.drop("Class")
numerical_columns
Out[24]:
Index(['Age', 'fnlwgt', 'Education_num', 'Capital_gain', 'Capital_loss',
       'hpw'],
      dtype='object')
In [25]:
for col in categorical_columns:
    print("{} has {} unique values:\n which are :\n {}".format(col, adults[col].nunique(), set(adults[col].unique())))
Work_class has 9 unique values:
 which are :
 {'State-gov', 'Without-pay', '?', 'Private', 'Local-gov', 'Self-emp-not-inc', 'Federal-gov', 'Never-worked', 'Self-emp-inc'}
Education has 16 unique values:
 which are :
 {'Doctorate', 'Prof-school', '11th', 'Assoc-acdm', '1st-4th', '5th-6th', 'Preschool', 'Bachelors', '12th', 'HS-grad', '7th-8th', 'Assoc-voc', 'Masters', 'Some-college', '10th', '9th'}
Maritial_status has 7 unique values:
 which are :
 {'Married-civ-spouse', 'Widowed', 'Divorced', 'Separated', 'Married-AF-spouse', 'Never-married', 'Married-spouse-absent'}
Occupation has 15 unique values:
 which are :
 {'Tech-support', 'Craft-repair', 'Sales', 'Transport-moving', 'Handlers-cleaners', '?', 'Farming-fishing', 'Exec-managerial', 'Priv-house-serv', 'Armed-Forces', 'Protective-serv', 'Prof-specialty', 'Adm-clerical', 'Machine-op-inspct', 'Other-service'}
Relationship has 6 unique values:
 which are :
 {'Husband', 'Own-child', 'Unmarried', 'Not-in-family', 'Other-relative', 'Wife'}
Race has 5 unique values:
 which are :
 {'Other', 'Amer-Indian-Eskimo', 'White', 'Asian-Pac-Islander', 'Black'}
Sex has 2 unique values:
 which are :
 {'Male', 'Female'}
native_country has 42 unique values:
 which are :
 {'Mexico', '?', 'Canada', 'Puerto-Rico', 'Trinadad&Tobago', 'Nicaragua', 'Jamaica', 'Cuba', 'China', 'Honduras', 'Holand-Netherlands', 'Philippines', 'Thailand', 'Poland', 'Japan', 'Ireland', 'United-States', 'Yugoslavia', 'Hungary', 'Scotland', 'Vietnam', 'Greece', 'South', 'Ecuador', 'India', 'Taiwan', 'Cambodia', 'Laos', 'Germany', 'Haiti', 'Dominican-Republic', 'France', 'Guatemala', 'Italy', 'Hong', 'Columbia', 'Outlying-US(Guam-USVI-etc)', 'England', 'Peru', 'Portugal', 'Iran', 'El-Salvador'}
In [26]:
dummy = pd.get_dummies(adults[categorical_columns],drop_first=True)
dummy.head()
Out[26]:
Work_class_Federal-gov Work_class_Local-gov Work_class_Never-worked Work_class_Private Work_class_Self-emp-inc Work_class_Self-emp-not-inc Work_class_State-gov Work_class_Without-pay Education_11th Education_12th ... native_country_Portugal native_country_Puerto-Rico native_country_Scotland native_country_South native_country_Taiwan native_country_Thailand native_country_Trinadad&Tobago native_country_United-States native_country_Vietnam native_country_Yugoslavia
0 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
1 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
2 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
3 0 0 0 1 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 1 0 0
4 0 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 94 columns
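pd.get_dummies works here because the whole dataset is encoded in one go. For a pipeline that must score unseen data, scikit-learn's OneHotEncoder remembers the categories seen at fit time and can ignore new ones; a sketch (assumes scikit-learn >= 0.20, where the encoder accepts string columns; not used further below):
In [ ]:
# Sketch: a re-usable encoder for deployment-style pipelines.
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown="ignore")
encoded = ohe.fit_transform(adults[categorical_columns])
encoded.shape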

In [27]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Note: fitting the scaler on the whole dataset before the train/test split
# leaks test-set statistics into preprocessing; see the pipeline sketch below.
adults[numerical_columns] = sc.fit_transform(adults[numerical_columns])
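To avoid that leakage, the scaler can live inside a Pipeline so it is re-fit on each training fold only. A minimal sketch (LogisticRegression is just a placeholder estimator, and X_unscaled stands for a hypothetical feature matrix built without the global scaling above):
In [ ]:
# Minimal leakage-free pattern: the scaler is fit inside each CV training fold.
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
pipe = make_pipeline(StandardScaler(), LogisticRegression())
# cross_val_score(pipe, X_unscaled, y, cv=3)   # X_unscaled: hypothetical raw features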
In [28]:
adults[numerical_columns].head()
Out[28]:
Age fnlwgt Education_num Capital_gain Capital_loss hpw
0 0.030671 -1.063611 1.134739 0.148453 -0.21666 -0.035429
1 0.837109 -1.008707 1.134739 -0.145920 -0.21666 -2.222153
2 -0.042642 0.245079 -0.420060 -0.145920 -0.21666 -0.035429
3 1.057047 0.425801 -1.197459 -0.145920 -0.21666 -0.035429
4 -0.775768 1.408176 1.134739 -0.145920 -0.21666 -0.035429
In [29]:
data = pd.concat([dummy,adults[numerical_columns],adults["Class"]], axis = 1)
In [30]:
X = data.iloc[: , :-1].values
y = data.iloc[:,-1].values
In [31]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

c1 = KNeighborsClassifier(n_neighbors=1)
c2 = RandomForestClassifier(random_state=1)
c3 = GaussianNB()
c4 = SVC()

for c, label in zip([c1, c2, c3, c4], ["KNN", "RandomForest", "Naive Bayes", "Support Vector Machine"]):
    scores = cross_val_score(c, X, y, cv=3, scoring="accuracy")
    print("accuracy {} +/- {} [{}]".format(scores.mean(), scores.std(), label))
accuracy 0.8014189494822662 +/- 0.0025677997025294486 [KNN]
accuracy 0.8462884685200729 +/- 0.0010725973236293852 [RandomForest]
accuracy 0.48693162620974345 +/- 0.03332009017352423 [Naive Bayes]
accuracy 0.8526766107806513 +/- 0.0038549657632717853 [Support Vector Machine]
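A baseline puts these scores in context: with a roughly 76/24 class split, predicting <=50K for everyone is already hard to beat on raw accuracy. A sketch using scikit-learn's DummyClassifier (not part of the original comparison):
In [ ]:
# Majority-class baseline for comparison.
from sklearn.dummy import DummyClassifier
baseline = DummyClassifier(strategy="most_frequent")
cross_val_score(baseline, X, y, cv=3, scoring="accuracy").mean()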
In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
In [34]:
from xgboost import XGBClassifier
classifier = XGBClassifier(n_estimators=100, max_depth=2)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, y_pred)
score
Out[34]:
0.8607983623336745
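Given the class imbalance, a per-class view is informative alongside accuracy; a quick sketch on the same predictions:
In [ ]:
# Per-class precision/recall for the XGBoost predictions.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))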
In [35]:
x_score = cross_val_score(classifier , X, y , scoring="accuracy", cv = 10)
print(x_score.mean(),"+/-",x_score.std() )
0.8582048756826159 +/- 0.004796749054344141
In [45]:
from tpot import TPOTClassifier
tpot_classifier = TPOTClassifier(generations=5, population_size=60, verbosity=2, max_time_mins=2)
tpot_classifier.fit(X, y)
TPOT closed prematurely. Will use the current best pipeline.
Best pipeline: GradientBoostingClassifier(RFE(input_matrix, RFE__ExtraTreesClassifier__criterion=entropy, RFE__ExtraTreesClassifier__max_features=0.35, RFE__ExtraTreesClassifier__n_estimators=100, RFE__step=0.6), GradientBoostingClassifier__learning_rate=0.1, GradientBoostingClassifier__max_depth=8, GradientBoostingClassifier__max_features=0.9, GradientBoostingClassifier__min_samples_leaf=1, GradientBoostingClassifier__min_samples_split=18, GradientBoostingClassifier__n_estimators=100, GradientBoostingClassifier__subsample=0.8)
Out[45]:
TPOTClassifier(config_dict={'sklearn.naive_bayes.GaussianNB': {}, 'sklearn.naive_bayes.BernoulliNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.naive_bayes.MultinomialNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.tree.DecisionT....45,
        0.5 ,  0.55,  0.6 ,  0.65,  0.7 ,  0.75,  0.8 ,  0.85,  0.9 ,
        0.95,  1.  ])}}}},
        crossover_rate=0.1, cv=5, disable_update_check=False,
        generations=1000000, max_eval_time_mins=5, max_time_mins=2,
        mutation_rate=0.9, n_jobs=1, offspring_size=60, population_size=60,
        random_state=None, scoring=None, subsample=1.0, verbosity=2,
        warm_start=False)
In [47]:
tpot_classifier.score(X_test,y_test)
Out[47]:
0.91136131013306043
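One caveat on that 0.911: tpot_classifier was fit on the full X, y above, so X_test was part of its search data and this score is optimistic. An unbiased protocol would search on the training split only; a sketch (tpot_holdout is a new, illustrative name):
In [ ]:
# Leakage-free TPOT evaluation: the search sees only the training split.
tpot_holdout = TPOTClassifier(generations=5, population_size=60,
                              verbosity=2, max_time_mins=2)
tpot_holdout.fit(X_train, y_train)
tpot_holdout.score(X_test, y_test)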
In [48]:
tpot_classifier.export("Adults_Ml.py")
In [50]:
# %load Adults_Ml.py
import numpy as np

from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# NOTE: Make sure that the class is labeled 'class' in the data file
# tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
# features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(X, y, random_state=42)

exported_pipeline = make_pipeline(
    RFE(estimator=ExtraTreesClassifier(criterion="entropy", max_features=0.35000000000000003, n_estimators=100), step=0.6000000000000001),
    GradientBoostingClassifier(learning_rate=0.1, max_depth=8, max_features=0.9000000000000001, min_samples_leaf=1, min_samples_split=18, n_estimators=100, subsample=0.8)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
results
Out[50]:
array([0, 0, 1, ..., 0, 1, 0], dtype=int64)
In [53]:
accuracy_score(testing_target, results)
Out[53]:
0.87483110183024193
In [54]:
cross_val_score(exported_pipeline, X, y, cv = 10 , scoring = "accuracy").mean()
Out[54]:
0.87153359537839115