|
??????Python???????????????????????????????
sklearn
pandas
numpy
??????????????????Python??????????
# Random-forest classification of the iris data set bundled with scikit-learn.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

# Put the four measurement columns into a DataFrame named after the features.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
print(df)

# Flag each row as training data with probability 0.75 (unseeded, so the
# split differs from run to run).
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75

# Species label as a categorical whose category order is iris.target_names,
# so category code k corresponds to iris.target_names[k].
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

train, test = df[df['is_train']], df[~df['is_train']]

features = df.columns[:4]
clf = RandomForestClassifier(n_jobs=2)

# Use the categorical codes rather than pd.factorize(): factorize numbers the
# classes by order of first appearance in `train`, which only matches the
# iris.target_names order by accident of row order. The codes are aligned
# with iris.target_names by construction, which the decoding below relies on.
y = train['species'].cat.codes
clf.fit(train[features], y)

# Predict class codes for the held-out rows and map them back to names.
test_pred = clf.predict(test[features])
preds = iris.target_names[test_pred]

# Confusion table: actual species (rows) against predicted species (columns).
print(pd.crosstab(test['species'], preds, rownames=['actual'], colnames=['preds']))
???????sklearn?????
kaggle-????????????????kaggle?????????????????????????????????????????????????????????????
??????
import pandas as pd # load csv's (pd.read_csv)import numpy as np # math (lin. algebra)import sklearn as skl # machine learningfrom sklearn.ensemble import RandomForestClassifier#from plotnine import *import matplotlib.pyplot as pltfrom sklearn.preprocessing import LabelEncoderfrom sklearn_pandas import DataFrameMapperfrom sklearn.tree import DecisionTreeClassifierfrom sklearn.metrics import classification_report
????
# Column names applied to both files; they are read with explicit `names`,
# i.e. treated as having no header row.
ADULT_COLUMNS = [
    'Age', 'Workclass', 'fnlgwt', 'Education', 'EdNum', 'MaritalStatus',
    'Occupation', 'Relationship', 'Race', 'Sex', 'CapitalGain',
    'CapitalLoss', 'HoursPerWeek', 'Country', 'Income',
]

# Default locations of the two files on the original author's machine.
ADULT_TRAIN_PATH = "D:/workspace/Data/kggal/AmericaIncome/adult.data"
ADULT_TEST_PATH = 'D:/workspace/Data/kggal/AmericaIncome/adult.test'


def get_train_data(path=ADULT_TRAIN_PATH):
    """Read the training file into a DataFrame with ADULT_COLUMNS as columns.

    Args:
        path: CSV file to read; defaults to the original hard-coded location.

    Returns:
        A DataFrame with one row per line of the file.
    """
    return pd.read_csv(path, names=ADULT_COLUMNS)


def get_test_data(path=ADULT_TEST_PATH, skiprows=1):
    """Read the test file into a DataFrame with ADULT_COLUMNS as columns.

    Args:
        path: CSV file to read; defaults to the original hard-coded location.
        skiprows: Number of leading lines to skip. The UCI ``adult.test``
            file starts with a non-data banner line ("|1x3 Cross validator").
            Reading it as data yields one junk row and forces the numeric
            columns to ``object`` dtype, so it is skipped by default.
            NOTE(review): pass ``skiprows=0`` if your copy of the file has
            already had that line removed.

    Returns:
        A DataFrame with one row per remaining line of the file.
    """
    return pd.read_csv(path, names=ADULT_COLUMNS, skiprows=skiprows)
?????
# Load both splits and discard the 'fnlgwt' column from each.
df_train_set = get_train_data()
df_test_set = get_test_data()
df_train_set.drop(columns='fnlgwt', inplace=True)
df_test_set.drop(columns='fnlgwt', inplace=True)

# Missing values are written as ' ?' in the files: turn them into NaN once,
# then keep only the rows that have no NaN at all.
train_with_nan = df_train_set.replace(' ?', np.nan)
test_with_nan = df_test_set.replace(' ?', np.nan)
train_set = train_with_nan.dropna()
test_set = test_with_nan.dropna()

# Shapes before and after dropping incomplete rows. The original author
# recorded (32561, 14) -> (30162, 15) for train and (16282, 14) -> (15060, 15)
# for test.
print(train_with_nan.shape)
print(train_set.shape)
print(test_with_nan.shape)
print(test_set.shape)

# The test labels carry a trailing '.'; strip it so both splits use the same
# two label strings.
test_set['Income'] = test_set['Income'].replace({' <=50K.': ' <=50K', ' >50K.': ' >50K'})
print(test_set['Income'].unique())      # recorded output: [' <=50K' ' >50K']
print(df_train_set['Income'].unique())  # recorded output: [' <=50K' ' >50K']

# Remove the textual 'Education' column from both cleaned splits.
train_set.drop(columns=["Education"], inplace=True)
test_set.drop(columns=["Education"], inplace=True)
???????????????????????
# Encode every text column as integer category codes. The encoding is done on
# the concatenation of both splits so that a given string receives the same
# code in train and test.
n_train_rows = train_set.shape[0]
combined_set = pd.concat([train_set, test_set], axis=0)
for feature in combined_set.columns:
    if combined_set[feature].dtype == 'object':
        combined_set[feature] = pd.Categorical(combined_set[feature]).codes

# Split back at the train/test boundary. The test part must start at the
# number of *training* rows; starting at the number of test rows (as the
# code did before) puts training rows into the evaluation set.
train_set = combined_set.iloc[:n_train_rows]
test_set = combined_set.iloc[n_train_rows:]
print(train_set.Workclass.unique())
print(test_set.Income.unique())

# Every column except the 'Income' label is a feature.
cols = list(train_set.columns)
cols.remove("Income")
x_train, y_train = train_set[cols].values, train_set["Income"].values
x_test, y_test = test_set[cols].values, test_set["Income"].values
????
# Fit a decision tree on the encoded training data.
treeClassifier = DecisionTreeClassifier()
treeClassifier.fit(x_train, y_train)

# Accuracy on the evaluation split, computed once and printed (it was
# previously computed twice, with the first result thrown away).
test_accuracy = treeClassifier.score(x_test, y_test)
print(test_accuracy)
# Output recorded by the original author: 0.8700351435581195

# Per-class precision / recall / F1 on the same split.
y_pred = treeClassifier.predict(x_test)
print(classification_report(y_test, y_pred))
# Output recorded by the original author:
#              precision    recall  f1-score   support
#           0       0.90      0.93      0.92     22674
#           1       0.77      0.67      0.72      7488
# avg / total       0.87      0.87      0.87     30162
#
# Original commentary (unreadable: its non-ASCII text was replaced by '?'):
# ??????????????????????????????????????????????????87%??????????????????3???????1???????????????????????????????
???????????????????
????
n??????????
??1 ??2 ??3 ??4 ... ??n ????
????
???????n???
??1 ??2 ??3 ??4 ... ??n ???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????>???????o2o????????
??????????
???????????????????????????????????????? MARS ??????
??????????????????
?????????????????????????? |
|