请选择 进入手机版 | 继续访问电脑版

集云开发者论坛

 找回密码
 立即注册
搜索
热搜: 活动 交友 discuz
查看: 13772|回复: 0

????????

[复制链接]

463

主题

477

帖子

1万

积分

管理员

Rank: 9Rank: 9Rank: 9

积分
13928
发表于 2018-8-6 11:16:57 | 显示全部楼层 |阅读模式
??????Python???????????????????????????????

sklearn
pandas
numpy
??????????????????Python??????????
# Random-forest demo on the iris data set: split the rows at random, fit a
# classifier on the training part, and cross-tabulate predicted against
# actual species on the held-out part.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

iris = load_iris()  # data set bundled with sklearn
df = pd.DataFrame(iris.data, columns=iris.feature_names)
print(df)

# Each row independently lands in the training set with probability 0.75.
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75
# Species as a categorical whose codes are exactly iris.target.
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
# Printed explicitly: a bare `df.head()` only displays inside a notebook.
print(df.head())

# 'is_train' is already boolean, so it can be used as a mask directly.
train, test = df[df['is_train']], df[~df['is_train']]
features = df.columns[:4]  # the four feature columns; excludes is_train/species

clf = RandomForestClassifier(n_jobs=2)
# Train on the categorical codes rather than pd.factorize(): factorize numbers
# labels by order of appearance, whereas the codes are guaranteed to index
# iris.target_names, which is how the predictions are decoded below.
y = train['species'].cat.codes
clf.fit(train[features], y)

test_pred = clf.predict(test[features])
preds = iris.target_names[test_pred]  # integer codes -> species names
# Confusion table (rows: actual species, columns: predicted species).
print(pd.crosstab(test['species'], preds, rownames=['actual'], colnames=['preds']))
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
???????sklearn?????
kaggle-????????????????kaggle?????????????????????????????????????????????????????????????

??????
import matplotlib.pyplot as plt
import numpy as np   # math (lin. algebra)
import pandas as pd  # load csv's (pd.read_csv)
import sklearn as skl   # machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn_pandas import DataFrameMapper
#from plotnine import *
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12

????
# Column names applied to both income files; passing them via `names=` means
# pandas does not take column names from the first line of the file.
_ADULT_COLUMNS = [
    'Age', 'Workclass', 'fnlgwt', 'Education', 'EdNum', 'MaritalStatus',
    'Occupation', 'Relationship', 'Race', 'Sex', 'CapitalGain', 'CapitalLoss',
    'HoursPerWeek', 'Country', 'Income',
]
_TRAIN_PATH = "D:/workspace/Data/kggal/AmericaIncome/adult.data"
_TEST_PATH = 'D:/workspace/Data/kggal/AmericaIncome/adult.test'


def get_train_data(path=_TRAIN_PATH):
    """Load the training CSV into a DataFrame with the 15 named columns.

    Args:
        path: File path (or any other source ``pd.read_csv`` accepts).
            Defaults to the original hard-coded training-file location.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with ``_ADULT_COLUMNS`` as
        column names. Values are not stripped or otherwise cleaned here.
    """
    return pd.read_csv(path, names=_ADULT_COLUMNS)


def get_test_data(path=_TEST_PATH):
    """Load the test CSV into a DataFrame with the 15 named columns.

    Args:
        path: File path (or any other source ``pd.read_csv`` accepts).
            Defaults to the original hard-coded test-file location.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with ``_ADULT_COLUMNS`` as
        column names. Values are not stripped or otherwise cleaned here.
    """
    return pd.read_csv(path, names=_ADULT_COLUMNS)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15

?????
# Load both files, drop columns that are not used as features, and remove
# every row that contains a missing value.
df_train_set = get_train_data()
df_test_set = get_test_data()

df_train_set.drop('fnlgwt', axis=1, inplace=True)
df_test_set.drop('fnlgwt', axis=1, inplace=True)

# Missing values are stored as ' ?' (with the leading space); turn them into
# NaN so dropna() can remove the affected rows. Each intermediate frame is
# built once and reused for both the diagnostic print and the final result.
train_marked = df_train_set.replace(' ?', np.nan)
train_set = train_marked.dropna()
test_marked = df_test_set.replace(' ?', np.nan)
test_set = test_marked.dropna()

# Shapes before/after dropping incomplete rows. The original post recorded
# (32561, 14) -> 30162 rows for train and (16282, 14) -> 15060 rows for test.
# (It listed 15 columns after dropna(), but dropna() cannot add a column.)
print(train_marked.shape)
print(train_set.shape)
print(test_marked.shape)
print(test_set.shape)

# The test file's labels carry a trailing '.'; map them onto the spelling
# used in the training file so both sets share the same two classes.
test_set['Income'] = test_set.Income.replace({' <=50K.': ' <=50K', ' >50K.': ' >50K'})
print(test_set.Income.unique())  # recorded output: [' <=50K' ' >50K']
print(df_train_set.Income.unique())  # recorded output: [' <=50K' ' >50K']

# Drop the textual 'Education' column from both sets.
# NOTE(review): presumably redundant with the numeric 'EdNum' column - confirm.
train_set.drop(["Education"], axis=1, inplace=True)
test_set.drop(["Education"], axis=1, inplace=True)
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30

???????????????????????
# Integer-encode the text columns of train and test together, so that a given
# category receives the same code in both sets, then split them apart again.
n_train = train_set.shape[0]  # boundary between train and test rows below
combined_set = pd.concat([train_set, test_set], axis=0)
for feature in combined_set.columns:
    if combined_set[feature].dtype == 'object':
        combined_set[feature] = pd.Categorical(combined_set[feature]).codes

# Split at the train/test boundary. The previous code started the test slice
# at the *test* length, which pulled training rows into the test set whenever
# the two sets differ in size (and leaked them into the evaluation).
train_set = combined_set.iloc[:n_train]
test_set = combined_set.iloc[n_train:]
print(train_set.Workclass.unique())
print(test_set.Income.unique())

# Every column except the label is a feature.
cols = list(train_set.columns)
cols.remove("Income")
x_train, y_train = train_set[cols].values, train_set["Income"].values
x_test, y_test = test_set[cols].values, test_set["Income"].values
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16

????
# Fit a decision tree with default settings and report how it does on the
# held-out data.
treeClassifier = DecisionTreeClassifier()
treeClassifier.fit(x_train, y_train)

# Score once and print it (the earlier version also computed the same score
# a first time and discarded the result).
print(treeClassifier.score(x_test, y_test))  # recorded output: 0.8700351435581195

y_pred = treeClassifier.predict(x_test)
print(classification_report(y_test, y_pred))
# Output recorded in the original post:
#              precision    recall  f1-score   support
#           0       0.90      0.93      0.92     22674
#           1       0.77      0.67      0.72      7488
# avg / total       0.87      0.87      0.87     30162
# NOTE(review): the recorded support (30162) is larger than the 15060 cleaned
# test rows reported earlier in this file, so these figures were probably
# produced with a test slice that included training rows - re-run to confirm.
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
??????????????????????????????????????????????????87%??????????????????3???????1???????????????????????????????
???????????????????

????
n??????????
??1 ??2 ??3 ??4 ... ??n ????
  • 1

????
???????n???
??1 ??2 ??3 ??4 ... ??n
  • 1
???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????>???????o2o????????
??????????

???????????????????????????????????????? MARS ??????
??????????????????
??????????????????????????
回复

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

QQ|Archiver|手机版|小黑屋|集云开发者论坛 ( 辽ICP备15012805号-2 )

GMT+8, 2019-12-8 16:20 , Processed in 0.060003 second(s), 21 queries .

Powered by Open Draft System

© 2007-2016 OpenDraft

快速回复 返回顶部 返回列表