数据示例:
,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Embarked_C,Embarked_Q,Embarked_S,Embarked_U 0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S,0,0,1,0 1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",0,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0,0 2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1,0 3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S,0,0,1,0 4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S,0,0,1,0 5,6,0,3,"Moran, Mr. James",1,23.8011805916,0,0,330877,8.4583,,Q,0,1,0,0 6,7,0,1,"McCarthy, Mr. Timothy J",1,54.0,0,0,17463,51.8625,E46,S,0,0,1,0 7,8,0,3,"Palsson, Master. Gosta Leonard",1,2.0,3,1,349909,21.075,,S,0,0,1,0 8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",0,27.0,0,2,347742,11.1333,,S,0,0,1,0 9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",0,14.0,1,0,237736,30.0708,,C,1,0,0,0 10,11,1,3,"Sandstrom, Miss. Marguerite Rut",0,4.0,1,1,PP 9549,16.7,G6,S,0,0,1,0 11,12,1,1,"Bonnell, Miss. Elizabeth",0,58.0,0,0,113783,26.55,C103,S,0,0,1,0 12,13,0,3,"Saundercock, Mr. William Henry",1,20.0,0,0,A/5. 2151,8.05,,S,0,0,1,0 13,14,0,3,"Andersson, Mr. Anders Johan",1,39.0,1,5,347082,31.275,,S,0,0,1,0 14,15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",0,14.0,0,0,350406,7.8542,,S,0,0,1,0 15,16,1,2,"Hewlett, Mrs. (Mary D Kingcome) ",0,55.0,0,0,248706,16.0,,S,0,0,1,0 16,17,0,3,"Rice, Master. Eugene",1,2.0,4,1,382652,29.125,,Q,0,1,0,0 17,18,1,2,"Williams, Mr. Charles Eugene",1,33.478692644,0,0,244373,13.0,,S,0,0,1,0 18,19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",0,31.0,1,0,345763,18.0,,S,0,0,1,0 19,20,1,3,"Masselmani, Mrs. Fatima",0,18.4510583333,0,0,2649,7.225,,C,1,0,0,0 20,21,0,2,"Fynney, Mr. Joseph J",1,35.0,0,0,239865,26.0,,S,0,0,1,0 21,22,1,2,"Beesley, Mr. Lawrence",1,34.0,0,0,248698,13.0,D56,S,0,0,1,0 22,23,1,3,"McGowan, Miss. Anna ""Annie""",0,15.0,0,0,330923,8.0292,,Q,0,1,0,0 23,24,1,1,"Sloper, Mr. William Thompson",1,28.0,0,0,113788,35.5,A6,S,0,0,1,0 24,25,0,3,"Palsson, Miss. Torborg Danira",0,8.0,3,1,349909,21.075,,S,0,0,1,0 25,26,1,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",0,38.0,1,5,347077,31.3875,,S,0,0,1,0 26,27,0,3,"Emir, Mr. Farred Chehab",1,34.8922936994,0,0,2631,7.225,,C,1,0,0,0 27,28,0,1,"Fortune, Mr. Charles Alexander",1,19.0,3,2,19950,263.0,C23 C25 C27,S,0,0,1,0 28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",0,22.8110194444,0,0,330959,7.8792,,Q,0,1,0,0 29,30,0,3,"Todoroff, Mr. Lalio",1,27.8541556913,0,0,349216,7.8958,,S,0,0,1,0 30,31,0,1,"Uruchurtu, Don. Manuel E",1,40.0,0,0,PC 17601,27.7208,,C,1,0,0,0 31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",0,38.0680685714,1,0,PC 17569,146.5208,B78,C,1,0,0,0 32,33,1,3,"Glynn, Miss. Mary Agatha",0,22.2371852543,0,0,335677,7.75,,Q,0,1,0,0 33,34,0,2,"Wheadon, Mr. Edward H",1,66.0,0,0,C.A. 24579,10.5,,S,0,0,1,0 34,35,0,1,"Meyer, Mr. Edgar Joseph",1,28.0,1,0,PC 17604,82.1708,,C,1,0,0,0 35,36,0,1,"Holverson, Mr. Alexander Oskar",1,42.0,1,0,113789,52.0,,S,0,0,1,0
# /usr/bin/python # -*- encoding:utf-8 -*- import xgboost as xgb import numpy as np from sklearn.linear_model import LogisticRegression from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestRegressor from sklearn.ensemble import RandomForestClassifier import pandas as pd import csv def show_accuracy(a, b, tip): acc = a.ravel() == b.ravel() acc_rate = 100 * float(acc.sum()) / a.size return acc_rate def load_data(file_name, is_train): data = pd.read_csv(file_name) # 数据文件路径 # print 'data.describe() = \n', data.describe() # 性别 将性别字段Sex中的值 female用0,male用1代替,类型 int data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int) # 补齐船票价格缺失值 if len(data.Fare[data.Fare.isnull()]) > 0: fare = np.zeros(3) for f in range(0, 3): fare[f] = data[data.Pclass == f + 1]['Fare'].dropna().median() for f in range(0, 3): # loop 0 to 2 data.loc[(data.Fare.isnull()) & (data.Pclass == f + 1), 'Fare'] = fare[f] # 年龄:使用均值代替缺失值 # mean_age = data['Age'].dropna().mean() # data.loc[(data.Age.isnull()), 'Age'] = mean_age if is_train: # 年龄:使用随机森林预测年龄缺失值 print '随机森林预测缺失年龄:--start--' data_for_age = data[['Age', 'Survived', 'Fare', 'Parch', 'SibSp', 'Pclass']] age_exist = data_for_age.loc[(data.Age.notnull())] # 年龄不缺失的数据 age_null = data_for_age.loc[(data.Age.isnull())] # print 'data_for_age=\n', data_for_age # print 'age_exis=\n', age_exist # print 'age_null=\n',age_null # print age_exist x = age_exist.values[:, 1:] y = age_exist.values[:, 0] # print 'x = age_exist.values[:, 1:] 中 x=',x # print 'y = age_exist.values[:, 0] 中 y=',y #n_estimators 决策树的个数,越多越好,值越大,性能就会越差,但至少100 rfr = RandomForestRegressor(n_estimators=1000) rfr.fit(x, y) age_hat = rfr.predict(age_null.values[:, 1:]) # print age_hat # print 'age_hat',age_hat #填充年龄字段中值为空的 data.loc[(data.Age.isnull()), 'Age'] = age_hat print '随机森林预测缺失年龄:--over--' else: print '随机森林预测缺失年龄2:--start--' data_for_age = data[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']] age_exist = data_for_age.loc[(data.Age.notnull())] # 年龄不缺失的数据 age_null = data_for_age.loc[(data.Age.isnull())] # print age_exist x = age_exist.values[:, 1:] y = age_exist.values[:, 0] rfr = RandomForestRegressor(n_estimators=1000) rfr.fit(x, y) age_hat = rfr.predict(age_null.values[:, 1:]) # print age_hat data.loc[(data.Age.isnull()), 'Age'] = age_hat print '随机森林预测缺失年龄2:--over--' # 起始城市 data.loc[(data.Embarked.isnull()), 'Embarked'] = 'S' # 保留缺失出发城市 # print data['Embarked'] embarked_data = pd.get_dummies(data.Embarked) # print embarked_data embarked_data = embarked_data.rename(columns=lambda x: 'Embarked_' + str(x)) data = pd.concat([data, embarked_data], axis=1) # print data.describe() data.to_csv('New_Data.csv') x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']] # x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']] y = None if 'Survived' in data: y = data['Survived'] x = np.array(x) y = np.array(y) x = np.tile(x, (5, 1)) y = np.tile(y, (5, )) if is_train: return x, y return x, data['PassengerId'] def write_result(c, c_type): file_name = '14.Titanic.test.csv' x, passenger_id = load_data(file_name, False) if type == 3: x = xgb.DMatrix(x) y = c.predict(x) y[y > 0.5] = 1 y[~(y > 0.5)] = 0 predictions_file = open("Prediction_%d.csv" % c_type, "wb") open_file_object = csv.writer(predictions_file) open_file_object.writerow(["PassengerId", "Survived"]) open_file_object.writerows(zip(passenger_id, y)) predictions_file.close() def totalSurvival(y_hat,tip): total=0 for index,value in enumerate(y_hat): if value==1: total=total+1 print tip,'存活:',total print '人' if __name__ == "__main__": #加载并完善特征数据 x, y = load_data('14.Titanic.train.csv', True) #划分训练集和测试集x表示样本特征集,y表示样本结果 test_size 样本占比,random_state 随机数的种子 x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=1) #print 'x_train=',x_train,'y_train=',y_train #logistic回归 lr = LogisticRegression(penalty='l2') lr.fit(x_train, y_train) y_hat = lr.predict(x_test) lr_rate = show_accuracy(y_hat, y_test, 'Logistic回归 ') totalSurvival(y_hat,'Logistic回归') #随机森林 n_estimators:决策树的个数,越多越好,不过值越大,性能就会越差,至少100 rfc = RandomForestClassifier(n_estimators=100) rfc.fit(x_train, y_train) y_hat = rfc.predict(x_test) rfc_rate = show_accuracy(y_hat, y_test, '随机森林 ') totalSurvival(y_hat,'随机森林') # write_result(rfc, 2) # XGBoost data_train = xgb.DMatrix(x_train, label=y_train) data_test = xgb.DMatrix(x_test, label=y_test) watch_list = [(data_test, 'eval'), (data_train, 'train')] param = {'max_depth': 6, 'eta': 0.8, 'silent': 1, 'objective': 'binary:logistic'} bst = xgb.train(param, data_train, num_boost_round=100, evals=watch_list) y_hat = bst.predict(data_test) y_hat[y_hat > 0.5] = 1 y_hat[~(y_hat > 0.5)] = 0 xgb_rate = show_accuracy(y_hat, y_test, 'XGBoost ') totalSurvival(y_hat,'xgboost') print 'Logistic回归:%.3f%%' % lr_rate print '随机森林:%.3f%%' % rfc_rate print 'XGBoost:%.3f%%' % xgb_rate
结果:
Logistic回归 存活: 813人 随机森林 存活: 859人 xgboost 存活: 872人 准确率: Logistic回归:78.770% 随机森林:98.160% XGBoost:97.935%