Killers in Kaggle Competition
1. XGBoost Model
import pandas as pd
'''
Compare the predictive power of a random forest and of an XGBoost model
on whether passengers of the Titanic survived.
'''
'''
***************************************************************
***************************************************************
'''
'''
Random forest: predicting whether passengers of the Titanic survived
'''
# Download the Titanic data via its URL
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
# Select pclass, age and sex as the training features
# (.copy() avoids pandas' SettingWithCopyWarning when age is filled in below)
X = titanic[['pclass','age','sex']].copy()
y = titanic['survived']
# Fill in missing age entries with the mean of the known ages
X['age'].fillna(X['age'].mean(), inplace=True)
# Split the data, randomly sampling 25% of it as the test set
# (sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
print(X_train)
print(y_train)
# Import DictVectorizer from sklearn.feature_extraction
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer(sparse=False)
# Vectorize the features: the categorical columns (pclass, sex) become one-hot indicators
# (note the orient argument of to_dict is spelled 'records')
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
X_test = vec.transform(X_test.to_dict(orient='records'))
print(X_train)
# Train a random forest classifier with its default settings and evaluate it on the test set
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
print('the accuracy of RandomForestClassifier on the test set:', rfc.score(X_test, y_test))
'''
XGBoost: predicting whether passengers of the Titanic survived
'''
from xgboost import XGBClassifier
xgbc = XGBClassifier()
xgbc.fit(X_train, y_train)
print('the accuracy of XGBoost on the test set:', xgbc.score(X_test, y_test))
pclass age sex
1086 3rd 31.194181 male
12 1st 31.194181 female
1036 3rd 31.194181 male
833 3rd 32.000000 male
1108 3rd 31.194181 male
562 2nd 41.000000 male
437 2nd 48.000000 female
663 3rd 26.000000 male
669 3rd 19.000000 male
507 2nd 31.194181 male
1167 3rd 31.194181 male
821 3rd 9.000000 male
327 2nd 32.000000 female
715 3rd 21.000000 male
308 1st 31.194181 female
1274 3rd 31.194181 male
640 3rd 40.000000 male
72 1st 70.000000 male
1268 3rd 31.194181 male
1024 3rd 31.194181 male
1047 3rd 31.194181 female
940 3rd 31.194181 male
350 2nd 20.000000 female
892 3rd 31.194181 male
555 2nd 30.000000 female
176 1st 36.000000 male
107 1st 31.194181 female
475 2nd 34.000000 female
330 2nd 23.000000 male
533 2nd 34.000000 male
... ... ... ...
235 1st 24.000000 male
465 2nd 22.000000 female
210 1st 31.194181 male
579 2nd 40.000000 female
650 3rd 23.000000 male
1031 3rd 31.194181 male
99 1st 24.000000 female
969 3rd 31.194181 male
535 2nd 31.194181 male
403 2nd 31.194181 male
744 3rd 45.000000 male
344 2nd 26.000000 male
84 1st 31.194181 male
528 2nd 20.000000 male
1270 3rd 31.194181 male
662 3rd 40.000000 male
395 2nd 42.000000 male
1196 3rd 31.194181 male
543 2nd 23.000000 male
845 3rd 31.194181 male
813 3rd 25.000000 male
61 1st 31.194181 female
102 1st 23.000000 female
195 1st 28.000000 male
57 1st 27.000000 male
1225 3rd 31.194181 male
658 3rd 31.194181 female
578 2nd 12.000000 female
391 2nd 18.000000 male
1044 3rd 31.194181 female
[984 rows x 3 columns]
1086 0
12 1
1036 0
833 0
1108 0
562 0
437 1
663 0
669 0
507 0
1167 1
821 1
327 1
715 0
308 1
1274 0
640 0
72 0
1268 0
1024 0
1047 1
940 1
350 1
892 0
555 1
176 1
107 1
475 1
330 0
533 0
..
235 0
465 0
210 1
579 1
650 0
1031 0
99 1
969 0
535 0
403 0
744 1
344 0
84 1
528 0
1270 0
662 0
395 0
1196 0
543 0
845 0
813 0
61 1
102 1
195 0
57 1
1225 0
658 1
578 1
391 0
1044 0
Name: survived, dtype: int64
[[ 31.19418104 0. 0. 1. 0. 1. ]
[ 31.19418104 1. 0. 0. 1. 0. ]
[ 31.19418104 0. 0. 1. 0. 1. ]
...,
[ 12. 0. 1. 0. 1. 0. ]
[ 18. 0. 1. 0. 0. 1. ]
[ 31.19418104 0. 0. 1. 1. 0. ]]
the accuracy of RandomForestClassifier on the test set: 0.775075987842
the accuracy of XGBoost on the test set: 0.787234042553
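With default settings, XGBoost already edges out the random forest on the held-out 25%. A natural next step is to tune a few of XGBClassifier's hyperparameters. Below is a minimal sketch (not part of the original run) using scikit-learn's GridSearchCV on the vectorized X_train/X_test from above; the grid values are illustrative assumptions, not recommendations.
# A hedged sketch: grid-search a few common XGBoost hyperparameters
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
params = {
    'max_depth': [2, 3, 5],               # maximum depth of each tree
    'n_estimators': [50, 100, 200],       # number of boosting rounds
    'learning_rate': [0.05, 0.1, 0.25],   # shrinkage applied to each round
}
gs = GridSearchCV(XGBClassifier(), params, cv=5, n_jobs=-1)
gs.fit(X_train, y_train)
print(gs.best_params_)
print('the accuracy of tuned XGBoost on the test set:', gs.score(X_test, y_test))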
2. TensorFlow Framework
2.1 Hello Google TensorFlow
import tensorflow as tf
'''
Use TensorFlow to print a string
'''
# Initialize a TensorFlow constant holding the string 'Hello Google Tensorflow! '
# and name it greeting; it becomes a node of the computation graph
greeting = tf.constant('Hello Google Tensorflow! ')
# Start a session
sess = tf.Session()
# Execute the greeting node inside the session
result = sess.run(greeting)
# Print the result of the run
print(result)
# Close the session; this is the explicit way of releasing it
sess.close()
b'Hello Google Tensorflow! '
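The b'...' prefix appears because, under Python 3, the session returns the string tensor as a bytes object. Decoding it (a small addition, not in the original post) yields a plain str:
# Decode the bytes returned by sess.run to get a regular Python string
print(result.decode('utf-8'))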
import tensorflow as tf
'''
Use TensorFlow to carry out a linear computation
'''
# Declare matrix1 as a TensorFlow 1x2 row vector
matrix1 = tf.constant([[3,3]])
# Declare matrix2 as a TensorFlow 2x1 column vector
matrix2 = tf.constant([[2],[2]])
# product multiplies the two operands above, forming a new node
product = tf.matmul(matrix1,matrix2)
# Add the scalar constant 2 to product, giving the final linear node
linear = tf.add(product,tf.constant(2))
# Run the linear node directly in a session; this executes all the nodes
# above, chained together as one dataflow graph
with tf.Session() as sess:
    result = sess.run(linear)
    print(result)
[[14]]
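The inputs above are baked into the graph as constants. In TensorFlow 1.x the same graph can instead take its input at run time through a placeholder; here is a small sketch of that (an addition, not part of the original post):
# Feed the row vector at run time via a placeholder instead of a constant
import tensorflow as tf
x = tf.placeholder(tf.int32, shape=[1, 2])        # input supplied per run
w = tf.constant([[2], [2]])                       # fixed 2x1 column vector
out = tf.add(tf.matmul(x, w), tf.constant(2))     # same linear computation
with tf.Session() as sess:
    print(sess.run(out, feed_dict={x: [[3, 3]]}))   # [[14]], as above
    print(sess.run(out, feed_dict={x: [[1, 4]]}))   # [[12]]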
2.2 Establishing Classifier
import numpy as np
import pandas as pd
import tensorflow as tf
'''
Define a custom linear classifier in TensorFlow to predict benign/malignant breast cancer tumors
'''
# Read the breast cancer training and test data from local files with pandas
train = pd.read_csv('breast-cancer-train.csv')
test = pd.read_csv('breast-cancer-test.csv')
print(train)
print(test)
# Separate the features from the classification target
X_train = np.float32(train[['Clump Thickness','Cell Size']].T)
y_train = np.float32(train['Type'].T)
X_test = np.float32(test[['Clump Thickness','Cell Size']].T)
y_test = np.float32(test['Type'].T)
print(X_train)
print(X_train.shape)
print(X_test)
print(X_test.shape)
# Define a TensorFlow variable b as the intercept of the linear model, initialized to 0.0
b = tf.Variable(tf.zeros([1]))
# Define a TensorFlow variable W as the coefficients of the linear model,
# initialized uniformly at random between -1.0 and 1.0
W = tf.Variable(tf.random_uniform([1,2],-1.0,1.0))
# Explicitly define the linear function
y = tf.matmul(W,X_train) + b
# Use TensorFlow's reduce_mean to get the mean squared error on the training set
loss = tf.reduce_mean(tf.square(y - y_train))
# Estimate W and b by gradient descent with step size 0.01,
# much like scikit-learn's SGDRegressor does
optimizer = tf.train.GradientDescentOptimizer(0.01)
# Take the squared loss as the optimization objective
train_optimizer = optimizer.minimize(loss)
# Initialize all variables (initialize_all_variables is deprecated;
# global_variables_initializer is its replacement)
init = tf.global_variables_initializer()
# Open a TensorFlow session
sess = tf.Session()
# Run the variable initialization
sess.run(init)
# Train the parameters for 1000 iterations
for step in range(0, 1000):
    sess.run(train_optimizer)
    if step % 10 == 0:
        print(step, sess.run(W), sess.run(b))
# Prepare the test samples
test_negative = test.loc[test['Type'] == 0][['Clump Thickness','Cell Size']]
test_positive = test.loc[test['Type'] == 1][['Clump Thickness','Cell Size']]
# Plot with the final learned parameters
import matplotlib.pyplot as plt
plt.scatter(test_negative['Clump Thickness'],test_negative['Cell Size'],marker='o',s=200,c='red')
plt.scatter(test_positive['Clump Thickness'],test_positive['Cell Size'],marker='x',s=150,c='black')
plt.xlabel('Clump Thickness')
plt.ylabel('Cell Size')
lx = np.arange(0,12)
# Note: 0.5 is the decision threshold, so the boundary line is computed as follows:
ly = (0.5 - sess.run(b) - lx * sess.run(W)[0][0]) / sess.run(W)[0][1]
plt.plot(lx,ly,color='green')
plt.show()
Unnamed: 0 Clump Thickness Cell Size Type
0 163 1 1 0
1 286 10 10 1
2 612 10 10 1
3 517 1 1 0
4 464 1 1 0
5 277 1 1 0
6 408 3 2 0
7 104 10 10 1
8 114 3 2 0
9 627 1 1 0
10 545 1 1 0
11 467 6 6 1
12 92 1 1 0
13 7 1 2 0
14 89 1 1 0
15 528 1 3 0
16 380 1 1 0
17 521 1 1 0
18 539 1 1 0
19 363 4 4 0
20 638 1 1 0
21 140 1 1 0
22 28 1 1 0
23 43 6 5 1
24 42 10 10 1
25 73 4 5 1
26 167 8 10 1
27 210 10 10 1
28 610 4 3 1
29 66 1 1 0
.. ... ... ... ...
494 252 3 3 0
495 21 5 5 1
496 313 1 1 0
497 459 1 3 0
498 160 7 7 1
499 276 1 1 0
500 191 5 10 1
501 385 3 2 0
502 413 1 2 0
503 491 8 9 1
504 343 1 1 0
505 308 8 7 1
506 661 1 1 0
507 130 1 3 0
508 663 1 3 0
509 99 5 6 1
510 372 1 2 0
511 87 6 6 1
512 458 1 2 0
513 330 4 7 1
514 214 10 10 1
515 466 6 6 1
516 121 2 1 0
517 614 1 1 0
518 20 3 2 1
519 71 10 2 1
520 106 10 10 1
521 270 4 7 1
522 435 8 10 1
523 102 1 2 0
[524 rows x 4 columns]
Unnamed: 0 Clump Thickness Cell Size Type
0 158 1 2 0
1 499 1 1 0
2 396 1 1 0
3 155 5 5 1
4 321 1 1 0
5 212 1 1 0
6 234 3 2 0
7 289 6 6 1
8 300 4 10 1
9 356 3 3 1
10 672 1 1 0
11 328 10 3 1
12 199 1 1 0
13 78 1 1 0
14 598 1 1 0
15 569 10 8 1
16 446 1 1 0
17 506 10 10 1
18 626 6 6 1
19 603 4 6 1
20 360 10 10 1
21 338 1 1 0
22 668 7 4 1
23 290 1 1 0
24 284 4 5 1
25 331 1 1 0
26 477 1 1 0
27 54 5 5 1
28 248 1 1 0
29 223 5 6 1
.. ... ... ... ...
145 302 10 10 1
146 552 2 2 0
147 215 7 8 1
148 235 1 4 0
149 18 7 7 1
150 250 2 2 0
151 260 5 8 1
152 430 3 1 0
153 264 9 4 1
154 61 1 1 0
155 213 10 10 1
156 377 1 1 0
157 29 1 3 0
158 182 1 1 0
159 306 1 1 0
160 388 1 1 0
161 329 4 6 1
162 437 1 1 0
163 296 3 4 0
164 584 1 1 0
165 342 1 1 0
166 436 10 10 1
167 579 1 1 0
168 326 1 1 1
169 362 2 2 0
170 617 1 1 0
171 578 1 1 0
172 231 8 7 1
173 336 5 5 1
174 655 1 1 0
[175 rows x 4 columns]
[[ 1. 10. 10. ..., 4. 8. 1.]
[ 1. 10. 10. ..., 7. 10. 2.]]
(2, 524)
[[ 1. 1. 1. 5. 1. 1. 3. 6. 4. 3. 1. 10. 1. 1.
1. 10. 1. 10. 6. 4. 10. 1. 7. 1. 4. 1. 1. 5.
1. 5. 1. 1. 1. 5. 1. 1. 1. 10. 1. 10. 1. 3.
10. 1. 1. 1. 2. 4. 1. 1. 2. 1. 10. 1. 3. 1.
1. 6. 1. 1. 1. 1. 10. 3. 1. 1. 10. 6. 1. 2.
3. 1. 9. 1. 1. 1. 1. 3. 1. 1. 1. 1. 1. 1.
1. 10. 3. 1. 1. 1. 2. 1. 10. 1. 1. 10. 1. 1.
1. 1. 1. 1. 1. 4. 4. 1. 8. 1. 1. 5. 7. 3.
1. 3. 3. 1. 1. 1. 1. 1. 1. 7. 1. 1. 10. 3.
1. 3. 7. 4. 1. 1. 10. 1. 6. 1. 10. 1. 1. 3.
2. 3. 1. 1. 1. 10. 2. 7. 1. 7. 2. 5. 3. 9.
1. 10. 1. 1. 1. 1. 1. 4. 1. 3. 1. 1. 10. 1.
1. 2. 1. 1. 8. 5. 1.]
[ 2. 1. 1. 5. 1. 1. 2. 6. 10. 3. 1. 3. 1. 1.
1. 8. 1. 10. 6. 6. 10. 1. 4. 1. 5. 1. 1. 5.
1. 6. 1. 1. 1. 4. 1. 1. 1. 10. 1. 8. 1. 3.
10. 1. 1. 1. 2. 5. 4. 1. 2. 1. 10. 1. 4. 4.
1. 5. 1. 1. 1. 3. 4. 1. 1. 1. 8. 6. 1. 1.
3. 1. 9. 1. 1. 1. 1. 2. 2. 1. 1. 1. 1. 1.
1. 10. 5. 1. 1. 1. 1. 1. 10. 3. 3. 4. 1. 1.
1. 1. 1. 1. 1. 6. 2. 2. 7. 1. 1. 3. 7. 3.
1. 4. 1. 2. 1. 1. 1. 3. 1. 10. 1. 1. 3. 6.
1. 2. 4. 3. 1. 1. 10. 1. 7. 1. 7. 1. 1. 1.
1. 3. 1. 1. 3. 10. 2. 8. 4. 7. 2. 8. 1. 4.
1. 10. 1. 3. 1. 1. 1. 6. 1. 4. 1. 1. 10. 1.
1. 2. 1. 1. 7. 5. 1.]]
(2, 175)
0 [[ 0.36668363 -0.50090975]] [ 0.07310659]
10 [[ 0.42902589 -0.31664807]] [ 0.08392984]
20 [[ 0.37770221 -0.26446074]] [ 0.07397496]
30 [[ 0.33350319 -0.21931998]] [ 0.06422745]
40 [[ 0.29543743 -0.18026219]] [ 0.05475901]
50 [[ 0.26265171 -0.1464566 ]] [ 0.04562278]
60 [[ 0.23441158 -0.11718685]] [ 0.03685662]
70 [[ 0.21008505 -0.09183522]] [ 0.02848593]
80 [[ 0.18912806 -0.06986891]] [ 0.02052602]
90 [[ 0.17107238 -0.05082829]] [ 0.01298404]
100 [[ 0.15551494 -0.03431684]] [ 0.00586066]
110 [[ 0.14210881 -0.01999237]] [-0.00084857]
120 [[ 0.13055533 -0.00755955]] [-0.00715207]
130 [[ 0.12059743 0.00323657]] [-0.0130613]
140 [[ 0.11201376 0.01261609]] [-0.01858996]
150 [[ 0.10461382 0.02076911]] [-0.02375336]
160 [[ 0.09823353 0.02785983]] [-0.02856789]
170 [[ 0.09273166 0.03403014]] [-0.03305059]
180 [[ 0.0879866 0.03940263]] [-0.03721882]
190 [[ 0.08389364 0.0440833 ]] [-0.04109001]
200 [[ 0.08036258 0.0481638 ]] [-0.04468136]
210 [[ 0.07731578 0.05172339]] [-0.04800977]
220 [[ 0.07468636 0.05483068]] [-0.05109166]
230 [[ 0.07241672 0.05754501]] [-0.05394287]
240 [[ 0.07045723 0.05991777]] [-0.05657862]
250 [[ 0.06876517 0.06199349]] [-0.05901347]
260 [[ 0.06730371 0.06381072]] [-0.06126123]
270 [[ 0.06604112 0.06540291]] [-0.06333503]
280 [[ 0.06495008 0.06679903]] [-0.06524725]
290 [[ 0.06400704 0.06802423]] [-0.06700955]
300 [[ 0.06319169 0.06910033]] [-0.06863291]
310 [[ 0.06248654 0.07004629]] [-0.07012761]
320 [[ 0.06187651 0.07087857]] [-0.07150328]
330 [[ 0.06134861 0.07161149]] [-0.07276891]
340 [[ 0.06089162 0.07225746]] [-0.07393289]
350 [[ 0.06049588 0.07282734]] [-0.07500301]
360 [[ 0.06015305 0.07333054]] [-0.07598653]
370 [[ 0.05985595 0.07377529]] [-0.07689022]
380 [[ 0.05959837 0.07416874]] [-0.07772031]
390 [[ 0.05937495 0.07451714]] [-0.07848261]
400 [[ 0.05918108 0.07482593]] [-0.07918249]
410 [[ 0.05901278 0.07509987]] [-0.07982493]
420 [[ 0.05886659 0.07534314]] [-0.08041453]
430 [[ 0.05873957 0.07555936]] [-0.0809555]
440 [[ 0.05862912 0.07575173]] [-0.08145178]
450 [[ 0.05853304 0.07592303]] [-0.08190699]
460 [[ 0.05844941 0.07607572]] [-0.08232445]
470 [[ 0.05837657 0.07621194]] [-0.08270727]
480 [[ 0.05831309 0.07633358]] [-0.08305824]
490 [[ 0.05825774 0.07644231]] [-0.08337997]
500 [[ 0.05820943 0.07653957]] [-0.08367487]
510 [[ 0.05816725 0.07662665]] [-0.08394514]
520 [[ 0.05813038 0.07670469]] [-0.08419282]
530 [[ 0.05809814 0.07677467]] [-0.08441976]
540 [[ 0.05806994 0.07683749]] [-0.0846277]
550 [[ 0.05804524 0.07689392]] [-0.08481821]
560 [[ 0.05802359 0.07694465]] [-0.08499274]
570 [[ 0.0580046 0.0769903]] [-0.08515258]
580 [[ 0.05798792 0.0770314 ]] [-0.085299]
590 [[ 0.05797327 0.07706842]] [-0.08543309]
600 [[ 0.05796038 0.0771018 ]] [-0.0855559]
610 [[ 0.05794904 0.07713193]] [-0.08566836]
620 [[ 0.05793905 0.07715912]] [-0.08577135]
630 [[ 0.05793025 0.07718369]] [-0.08586565]
640 [[ 0.05792246 0.0772059 ]] [-0.08595198]
650 [[ 0.05791559 0.07722599]] [-0.08603103]
660 [[ 0.05790952 0.07724417]] [-0.08610339]
670 [[ 0.05790414 0.07726063]] [-0.08616965]
680 [[ 0.05789939 0.07727554]] [-0.0862303]
690 [[ 0.05789516 0.07728906]] [-0.08628582]
700 [[ 0.05789141 0.07730132]] [-0.08633664]
710 [[ 0.05788808 0.07731244]] [-0.08638318]
720 [[ 0.05788512 0.07732254]] [-0.08642577]
730 [[ 0.05788249 0.0773317 ]] [-0.08646475]
740 [[ 0.05788014 0.07734002]] [-0.08650042]
750 [[ 0.05787804 0.07734759]] [-0.08653308]
760 [[ 0.05787617 0.07735448]] [-0.08656296]
770 [[ 0.0578745 0.07736073]] [-0.08659032]
780 [[ 0.057873 0.07736642]] [-0.08661535]
790 [[ 0.05787166 0.0773716 ]] [-0.08663826]
800 [[ 0.05787047 0.07737631]] [-0.08665923]
810 [[ 0.05786939 0.07738061]] [-0.08667842]
820 [[ 0.05786842 0.07738452]] [-0.08669598]
830 [[ 0.05786756 0.07738808]] [-0.08671205]
840 [[ 0.05786679 0.07739132]] [-0.08672676]
850 [[ 0.05786609 0.07739428]] [-0.08674022]
860 [[ 0.05786546 0.07739697]] [-0.08675253]
870 [[ 0.05786489 0.07739943]] [-0.0867638]
880 [[ 0.05786439 0.07740167]] [-0.08677411]
890 [[ 0.05786392 0.07740371]] [-0.08678355]
900 [[ 0.05786351 0.07740557]] [-0.08679216]
910 [[ 0.05786314 0.07740727]] [-0.08680007]
920 [[ 0.0578628 0.07740883]] [-0.0868073]
930 [[ 0.05786249 0.07741025]] [-0.08681391]
940 [[ 0.05786222 0.07741154]] [-0.08681997]
950 [[ 0.05786196 0.07741272]] [-0.0868255]
960 [[ 0.05786174 0.0774138 ]] [-0.08683058]
970 [[ 0.05786153 0.07741478]] [-0.08683521]
980 [[ 0.05786135 0.07741567]] [-0.08683946]
990 [[ 0.05786117 0.0774165 ]] [-0.08684334]
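The plot shows the learned line against the test points, but the script never scores the model. Here is a hedged follow-up sketch (not in the original post) that evaluates it with the same 0.5 threshold used for the plotted boundary; the exact figure depends on the random initialization of W.
# Score the learned linear model on the test set with the 0.5 threshold
W_val, b_val = sess.run(W), sess.run(b)
predictions = (np.matmul(W_val, X_test) + b_val > 0.5).astype(np.float32)
print('the accuracy of the TensorFlow linear classifier on the test set:',
      np.mean(predictions == y_test))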