# Preprocessing walkthrough for 'data.csv' (columns: country, age, salary,
# purchase): fill missing numeric values with the column mean, label-encode
# and one-hot encode the country column, then split into train/test sets.
#
# SimpleImputer and ColumnTransformer are used because
# sklearn.preprocessing.Imputer and OneHotEncoder(categorical_features=...)
# were removed in scikit-learn 0.22.
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

np.set_printoptions(suppress=True)

df = pd.read_csv('data.csv')
x = df.iloc[:, :3].values  # feature columns: country, age, salary
y = df.iloc[:, 3].values   # target column: purchase

# Replace NaN in the numeric columns (age, salary) with the column mean.
# The slice is cast to float explicitly because x is an object array
# (it mixes strings and numbers).
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x[:, 1:] = imputer.fit_transform(x[:, 1:].astype(float))
print(x)

# Map country names to integer codes (kept so the intermediate printout
# still shows the label-encoded column).
le = LabelEncoder()
x[:, 0] = le.fit_transform(x[:, 0])
print(x)

# One-hot encode column 0 and pass the remaining columns through unchanged.
# The encoded columns come first in the result, followed by age and salary.
# sparse_threshold=0 forces a dense result, which is then cast to float so
# the final array is numeric rather than object-typed.
ct = ColumnTransformer(
    [('country', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
x = np.asarray(ct.fit_transform(x), dtype=float)
print(x)

# 80/20 split; no random_state is set, so the rows chosen differ per run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
print(x_train)
Dataset (contents of data.csv):
country,age,salary,purchase
france,44,72000,no
spain,27,48000,yes
germany,30,54000,no
spain,38,61000,no
germany,40,nan,yes
france,35,58000,yes
spain,nan,52000,no
france,48,79000,yes
germany,50,83000,no
france,37,67000,yes
Output:
[['france' 44.0 72000.0]
['spain' 27.0 48000.0]
['germany' 30.0 54000.0]
['spain' 38.0 61000.0]
['germany' 40.0 63777.77777777778]
['france' 35.0 58000.0]
['spain' 38.77777777777778 52000.0]
['france' 48.0 79000.0]
['germany' 50.0 83000.0]
['france' 37.0 67000.0]]
[[0 44.0 72000.0]
[2 27.0 48000.0]
[1 30.0 54000.0]
[2 38.0 61000.0]
[1 40.0 63777.77777777778]
[0 35.0 58000.0]
[2 38.77777777777778 52000.0]
[0 48.0 79000.0]
[1 50.0 83000.0]
[0 37.0 67000.0]]
[[ 1. 0. 0. 44.
72000. ]
[ 0. 0. 1. 27.
48000. ]
[ 0. 1. 0. 30.
54000. ]
[ 0. 0. 1. 38.
61000. ]
[ 0. 1. 0. 40.
63777.77777778]
[ 1. 0. 0. 35.
58000. ]
[ 0. 0. 1. 38.77777778
52000. ]
[ 1. 0. 0. 48.
79000. ]
[ 0. 1. 0. 50.
83000. ]
[ 1. 0. 0. 37.
67000. ]]
[[ 0. 1. 0. 30.
54000. ]
[ 0. 1. 0. 40.
63777.77777778]
[ 0. 1. 0. 50.
83000. ]
[ 1. 0. 0. 37.
67000. ]
[ 0. 0. 1. 38.77777778
52000. ]
[ 1. 0. 0. 44.
72000. ]
[ 0. 0. 1. 27.
48000. ]
[ 1. 0. 0. 48.
79000. ]]
# Preprocessing walkthrough for 'data.csv' (columns: country, age, salary,
# purchase): fill missing numeric values with the column mean, label-encode
# and one-hot encode the country column, then split into train/test sets.
#
# SimpleImputer and ColumnTransformer are used because
# sklearn.preprocessing.Imputer and OneHotEncoder(categorical_features=...)
# were removed in scikit-learn 0.22.
import numpy as np
import pandas as pd  # required by pd.read_csv below
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

np.set_printoptions(suppress=True)

df = pd.read_csv('data.csv')
x = df.iloc[:, :3].values  # feature columns: country, age, salary
y = df.iloc[:, 3].values   # target column: purchase

# Replace NaN in the numeric columns (age, salary) with the column mean.
# The slice is cast to float explicitly because x is an object array
# (it mixes strings and numbers).
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
x[:, 1:] = imputer.fit_transform(x[:, 1:].astype(float))
print(x)

# Map country names to integer codes (kept so the intermediate printout
# still shows the label-encoded column).
le = LabelEncoder()
x[:, 0] = le.fit_transform(x[:, 0])
print(x)

# One-hot encode column 0 and pass the remaining columns through unchanged.
# The encoded columns come first in the result, followed by age and salary.
# sparse_threshold=0 forces a dense result, which is then cast to float so
# the final array is numeric rather than object-typed.
ct = ColumnTransformer(
    [('country', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
x = np.asarray(ct.fit_transform(x), dtype=float)
print(x)

# 80/20 split; no random_state is set, so the rows chosen differ per run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
print(x_train)
Dataset (contents of data.csv):
country,age,salary,purchase
france,44,72000,no
spain,27,48000,yes
germany,30,54000,no
spain,38,61000,no
germany,40,nan,yes
france,35,58000,yes
spain,nan,52000,no
france,48,79000,yes
germany,50,83000,no
france,37,67000,yes
Output:
[['france' 44.0 72000.0]
['spain' 27.0 48000.0]
['germany' 30.0 54000.0]
['spain' 38.0 61000.0]
['germany' 40.0 63777.77777777778]
['france' 35.0 58000.0]
['spain' 38.77777777777778 52000.0]
['france' 48.0 79000.0]
['germany' 50.0 83000.0]
['france' 37.0 67000.0]]
[[0 44.0 72000.0]
[2 27.0 48000.0]
[1 30.0 54000.0]
[2 38.0 61000.0]
[1 40.0 63777.77777777778]
[0 35.0 58000.0]
[2 38.77777777777778 52000.0]
[0 48.0 79000.0]
[1 50.0 83000.0]
[0 37.0 67000.0]]
[[ 1. 0. 0. 44.
72000. ]
[ 0. 0. 1. 27.
48000. ]
[ 0. 1. 0. 30.
54000. ]
[ 0. 0. 1. 38.
61000. ]
[ 0. 1. 0. 40.
63777.77777778]
[ 1. 0. 0. 35.
58000. ]
[ 0. 0. 1. 38.77777778
52000. ]
[ 1. 0. 0. 48.
79000. ]
[ 0. 1. 0. 50.
83000. ]
[ 1. 0. 0. 37.
67000. ]]
[[ 0. 1. 0. 30.
54000. ]
[ 0. 1. 0. 40.
63777.77777778]
[ 0. 1. 0. 50.
83000. ]
[ 1. 0. 0. 37.
67000. ]
[ 0. 0. 1. 38.77777778
52000. ]
[ 1. 0. 0. 44.
72000. ]
[ 0. 0. 1. 27.
48000. ]
[ 1. 0. 0. 48.
79000. ]]
No comments:
Post a Comment