signed

QiShunwang

“诚信为本、客户至上”

集成学习-xgboost学习

2021/4/26 14:10:40   来源:

XGboost相关学习

from xgboost import XGBRegressor as XGBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LinearRegression as LinearR
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime
data = load_boston()
#波士顿数据集非常简单,但它所涉及到的问题却很多
data
{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
        19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
        20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
        23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
        33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
        21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
        20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
        23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
        15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,
        17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,
        25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,
        23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,
        32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,
        34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,
        20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,
        26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,
        31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1,
        22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,
        42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,
        36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,
        32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,
        20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,
        20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,
        22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,
        21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6,
        19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,
        32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,
        18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,
        16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,
        13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3,  8.8,
         7.2, 10.5,  7.4, 10.2, 11.5, 15.1, 23.2,  9.7, 13.8, 12.7, 13.1,
        12.5,  8.5,  5. ,  6.3,  5.6,  7.2, 12.1,  8.3,  8.5,  5. , 11.9,
        27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3,  7. ,  7.2,  7.5, 10.4,
         8.8,  8.4, 16.7, 14.2, 20.8, 13.4, 11.7,  8.3, 10.2, 10.9, 11. ,
         9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4,  9.6,  8.7,  8.4, 12.8,
        10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,
        15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,
        19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,
        29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,
        20.6, 21.2, 19.1, 20.6, 15.2,  7. ,  8.1, 13.6, 20.1, 21.8, 24.5,
        23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9]),
 'feature_names': array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
        'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7'),
 'DESCR': ".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:**  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate per $10,000\n        - PTRATIO  pupil-teacher ratio by town\n        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n        - LSTAT    % lower status of the population\n        - MEDV     Median value of owner-occupied homes in $1000's\n\n    :Missing Attribute Values: None\n\n    :Creator: Harrison, D. and Rubinfeld, D.L.\n\nThis is a copy of UCI ML housing dataset.\nhttps://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n\n\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\nprices and the demand for clean air', J. Environ. Economics & Management,\nvol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n...', Wiley, 1980.   N.B. 
Various transformations are used in the table on\npages 244-261 of the latter.\n\nThe Boston house-price data has been used in many machine learning papers that address regression\nproblems.   \n     \n.. topic:: References\n\n   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n",
 'filename': 'f:\\Anaconda3\\lib\\site-packages\\sklearn\\datasets\\data\\boston_house_prices.csv'}
X = data.data
y = data.target
X.shape
(506, 13)
y.shape
(506,)
y
array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,
       17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,
       25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,
       23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,
       32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,
       34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,
       20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,
       26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,
       31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1,
       22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,
       42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,
       36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,
       32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,
       20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,
       20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,
       22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,
       21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6,
       19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,
       32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,
       18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,
       16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,
       13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3,  8.8,
        7.2, 10.5,  7.4, 10.2, 11.5, 15.1, 23.2,  9.7, 13.8, 12.7, 13.1,
       12.5,  8.5,  5. ,  6.3,  5.6,  7.2, 12.1,  8.3,  8.5,  5. , 11.9,
       27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3,  7. ,  7.2,  7.5, 10.4,
        8.8,  8.4, 16.7, 14.2, 20.8, 13.4, 11.7,  8.3, 10.2, 10.9, 11. ,
        9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4,  9.6,  8.7,  8.4, 12.8,
       10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,
       15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,
       19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,
       29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,
       20.6, 21.2, 19.1, 20.6, 15.2,  7. ,  8.1, 13.6, 20.1, 21.8, 24.5,
       23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9])
Xtrain,Xtest,Ytrain,Ytest = TTS(X,y,test_size=0.3,random_state=420)
reg = XGBR(n_estimators=100).fit(Xtrain,Ytrain) #训练
reg.predict(Xtest) #传统接口predict
reg.score(Xtest,Ytest) #你能想出这里应该返回什么模型评估指标么?利用shift+Tab可以知道,R^2评估指标
y.mean()
MSE(Ytest,reg.predict(Xtest))#可以看出均方误差是平均值y.mean()的1/3左右,结果不算好也不算坏
reg.feature_importances_ #树模型的优势之一:能够查看模型的重要性分数,可以使用嵌入法(SelectFromModel)进行特征选择
#xgboost可以使用嵌入法进行特征选择
reg = XGBR(n_estimators=100) #交叉验证中导入的没有经过训练的模型
CVS(reg,Xtrain,Ytrain,cv=5).mean()
#这里应该返回什么模型评估指标,还记得么? 返回的是与reg.score相同的评估指标R^2(回归),准确率(分类)
0.8017863029875325
#严谨的交叉验证与不严谨的交叉验证之间的讨论:训练集 or 全数据?
array([0.83340801, 0.77096033, 0.83473392, 0.80424149, 0.76558778])
#严谨 vs 不严谨
CVS(reg,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()
-16.041115480238048
#来查看一下sklearn中所有的模型评估指标
import sklearn
sorted(sklearn.metrics.SCORERS.keys())
['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'brier_score_loss',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'mutual_info_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'v_measure_score']
#使用随机森林和线性回归进行一个对比
rfr = RFR(n_estimators=100)
CVS(rfr,Xtrain,Ytrain,cv=5).mean()#0.7975497480638329
0.7975497480638329
CVS(rfr,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()#-16.998723616338033
-16.998723616338033
lr = LinearR()
CVS(lr,Xtrain,Ytrain,cv=5).mean()#0.6835070597278085
0.6835070597278085
CVS(lr,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()#-25.34950749364844
-25.34950749364844
#如果开启参数silent:在数据巨大,预料到算法运行会非常缓慢的时候可以使用这个参数来监控模型的训练进度
reg = XGBR(n_estimators=10,silent=True)#xgboost库silent=True不会打印训练进程,只返回运行结果,默认是False会打印训练进程
#sklearn库中的xgboost的默认为silent=True不会打印训练进程,想打印需要手动设置为False
CVS(reg,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()#-92.67865836936579
-92.67865836936579
def plot_learning_curve(estimator,title, X, y, 
                        ax=None, # axes (subplot) to draw on; current axes if None
                        ylim=None, # (low, high) limits for the y axis
                        cv=None, # cross-validation strategy passed to learning_curve
                        n_jobs=None # number of parallel jobs passed to learning_curve
                       ):
    """Plot mean training and cross-validation scores against training-set size.

    Runs sklearn's ``learning_curve`` on ``estimator`` with shuffling enabled
    and a fixed ``random_state`` of 420, then draws two lines: the training
    score and the test (validation) score, each averaged over axis 1 of the
    score arrays returned by ``learning_curve``.

    Parameters
    ----------
    estimator : estimator object passed unchanged to ``learning_curve``.
    title : title set on the axes.
    X, y : training data and targets passed to ``learning_curve``.
    ax : matplotlib axes to draw on. When None, ``plt.gca()`` is used.
    ylim : optional pair unpacked into ``ax.set_ylim``.
    cv : forwarded to ``learning_curve``.
    n_jobs : forwarded to ``learning_curve``.

    Returns
    -------
    The axes the curves were drawn on.
    """
    from sklearn.model_selection import learning_curve
    import matplotlib.pyplot as plt
    import numpy as np

    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y
                                                            ,shuffle=True
                                                            ,cv=cv
                                                            ,random_state=420
                                                            ,n_jobs=n_jobs)
    # Only fall back to the current axes when the caller supplied none.
    # (The previous code replaced a supplied axes with a new Figure, which
    # has no set_title/set_xlabel and therefore crashed.)
    if ax is None:
        ax = plt.gca()
    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.grid() # grid lines are optional, purely cosmetic
    ax.plot(train_sizes, np.mean(train_scores, axis=1), 'o-'
            , color="r",label="Training score")
    ax.plot(train_sizes, np.mean(test_scores, axis=1), 'o-'
            , color="g",label="Test score")
    ax.legend(loc="best")
    return ax
cv = KFold(n_splits=5, shuffle = True, random_state=42) #交叉验证模式
plot_learning_curve(XGBR(n_estimators=100,random_state=420)
                    ,"XGB",Xtrain,Ytrain,ax=None,cv=cv)
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-xpQh3MDR-1619417048279)(output_28_0.png)]

#=====【TIME WARNING:25 seconds】=====#

axisx = range(10,1010,50)
rs = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    rs.append(CVS(reg,Xtrain,Ytrain,cv=cv).mean())
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()
660 0.8046775284172915

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-6da4aVib-1619417048281)(output_29_1.png)]

#选出来的n_estimators非常不寻常,我们是否要选择准确率最高的n_estimators值呢?
#======【TIME WARNING: 20s】=======#
axisx = range(50,1050,50)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    #记录1-偏差
    rs.append(cvresult.mean())
    #记录方差
    var.append(cvresult.var())
    #计算泛化误差的可控部分
    ge.append((1 - cvresult.mean())**2+cvresult.var())
#打印R2最高所对应的参数取值,并打印这个参数下的方差
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
#打印方差最低时对应的参数取值,并打印这个参数下的R2
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
#打印泛化误差可控部分的参数取值,并打印这个参数下的R2,方差以及泛化误差的可控部分
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()
650 0.80476050359201 0.01053673846018678
50 0.7857724708830981 0.009072727885598212
150 0.8032842414878519 0.009747694343514357 0.04844478399052411

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-eJ1DzHzL-1619417048282)(output_31_1.png)]

axisx = range(100,300,10)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)*0.01
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
#添加方差线
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
180 0.8038787848970184 0.00959321570484315
180 0.8038787848970184 0.00959321570484315
180 0.8038787848970184 0.00959321570484315 0.04805674671831314

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-qD6ST091-1619417048283)(output_32_1.png)]

#看看泛化误差的可控部分如何?
plt.figure(figsize=(20,5))
plt.plot(axisx,ge,c="gray",linestyle='-.')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-pxH53h8Q-1619417048284)(output_33_0.png)]

#验证模型效果是否提高了?
time0 = time()
print(XGBR(n_estimators=100,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)
0.9197580267581366
0.0787498950958252
time0 = time()
print(XGBR(n_estimators=660,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)
0.9208745746309475
0.36807847023010254
time0 = time()
print(XGBR(n_estimators=180,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)
0.9231068620728082
0.12366437911987305
axisx = np.linspace(0,1,20)
rs = []
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    rs.append(CVS(reg,Xtrain,Ytrain,cv=cv).mean())
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="green",label="XGB")
plt.legend()
plt.show()
0.7368421052631579 0.837609040251761

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-7svP0syk-1619417048285)(output_37_1.png)]

#继续细化学习曲线
axisx = np.linspace(0.05,1,20)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
0.65 0.8302530801197368 0.008708816667924316
0.7999999999999999 0.8277414964661117 0.007159903723250457
0.7999999999999999 0.8277414964661117 0.007159903723250457 0.036832895762985055

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-qvvhiCYL-1619417048286)(output_38_1.png)]

#细化学习曲线
axisx = np.linspace(0.75,1,25)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
0.7708333333333334 0.833489187182165 0.005575077682875093
0.7708333333333334 0.833489187182165 0.005575077682875093
0.7708333333333334 0.833489187182165 0.005575077682875093 0.033300928468131166

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-jULrwpAL-1619417048286)(output_39_1.png)]

reg = XGBR(n_estimators=180
         #  ,subsample=0.7708333333333334
           ,random_state=420).fit(Xtrain,Ytrain)
reg.score(Xtest,Ytest)
0.9159462982185405
MSE(Ytest,reg.predict(Xtest))
7.821523502888769
#首先我们先来定义一个评分函数,这个评分函数能够帮助我们直接打印Xtrain上的交叉验证结果
def regassess(reg,Xtrain,Ytrain,cv,scoring = ("r2",),show=True):
    """Return the mean cross-validation score of ``reg`` for each metric.

    Parameters
    ----------
    reg : estimator passed to ``CVS`` (the file's alias for
        ``cross_val_score``).
    Xtrain, Ytrain : data and targets used for cross-validation.
    cv : cross-validation strategy forwarded to ``CVS``.
    scoring : sequence of scorer names; one cross-validation run per name.
    show : when True, print ``"<metric>:<mean>"`` with two decimals for
        every metric.

    Returns
    -------
    list of the mean scores, in the same order as ``scoring``.
    """
    score = []
    for metric in scoring:
        # Cross-validate once per metric and reuse the result for both the
        # printout and the returned list (previously it ran twice when
        # show=True, doubling the cost).
        mean_score = CVS(reg,Xtrain,Ytrain,cv=cv,scoring=metric).mean()
        if show:
            print("{}:{:.2f}".format(metric, mean_score))
        score.append(mean_score)
    return score
reg = XGBR(n_estimators=180,random_state=420)
regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"])
r2:0.80
neg_mean_squared_error:-13.48





[0.8038787848970184, -13.482301822063182]
regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"],show=False)
[0.8038787848970184, -13.482301822063182]
from time import time
import datetime

for i in [0,0.2,0.5,1]:
    time0=time()
    reg = XGBR(n_estimators=180,random_state=420,learning_rate=i)
    print("learning_rate = {}".format(i))
    regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"])
    print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
    print("\t")
learning_rate = 0
r2:-6.76
neg_mean_squared_error:-567.55
00:01:561781
	
learning_rate = 0.2
r2:0.81
neg_mean_squared_error:-13.32
00:01:848888
	
learning_rate = 0.5
r2:0.81
neg_mean_squared_error:-13.24
00:01:541875
	
learning_rate = 1
r2:0.72
neg_mean_squared_error:-19.11
00:01:499027
axisx = np.arange(0.05,1,0.05)
rs = []
te = []
for i in axisx:
    reg = XGBR(n_estimators=180,random_state=420,learning_rate=i)
    score = regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"],show=False)
    test = reg.fit(Xtrain,Ytrain).score(Xtest,Ytest)
    rs.append(score[0])
    te.append(test)
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,te,c="gray",label="test")
plt.plot(axisx,rs,c="green",label="train")
plt.legend()
plt.show()
0.55 0.8125604372670463

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Kt53b60m-1619417048287)(output_47_1.png)]

for booster in ["gbtree","gblinear","dart"]:
    reg = XGBR(n_estimators=180
               ,learning_rate=0.1
               ,random_state=420
               ,booster=booster).fit(Xtrain,Ytrain)
    print(booster)
    print(reg.score(Xtest,Ytest))
gbtree
0.9231068620728082
gblinear
0.6286510307485139
dart
0.923106843149575
#默认reg:linear
reg = XGBR(n_estimators=180,random_state=420).fit(Xtrain,Ytrain)
reg.score(Xtest, Ytest)
0.9231068620728082
MSE(Ytest,reg.predict(Xtest))
7.155205217161047
#xgb实现法
import xgboost as xgb
#使用类DMatrix读取数据
dtrain = xgb.DMatrix(Xtrain,Ytrain) #特征矩阵和标签都进行一个传入
dtest = xgb.DMatrix(Xtest,Ytest)
#非常遗憾无法打开来查看,所以通常都是先读到pandas里面查看之后再放到DMatrix中
dtrain
<xgboost.core.DMatrix at 0x2770de3bdd8>
import pandas as pd
pd.DataFrame(Xtrain)
0123456789101112
00.030410.05.190.00.5155.89559.65.61505.0224.020.2394.8110.56
10.0411325.04.860.00.4266.72733.55.40074.0281.019.0396.905.29
210.233000.018.100.00.6146.18596.72.170524.0666.020.2379.7018.03
30.171420.06.910.00.4485.68233.85.10043.0233.017.9396.9010.21
40.050590.04.490.00.4496.38948.04.77943.0247.018.5396.909.62
50.135870.010.591.00.4896.06459.14.23924.0277.018.6381.3214.66
60.0498121.05.640.00.4395.99821.46.81474.0243.016.8396.908.43
70.0254355.03.780.00.4846.69656.45.73215.0370.017.6396.907.18
80.107930.08.560.00.5206.19554.42.77785.0384.020.9393.4913.00
90.024980.01.890.00.5186.54059.76.26691.0422.015.9389.968.65
100.092990.025.650.00.5815.96192.92.08692.0188.019.1378.0917.93
110.158760.010.810.00.4135.96117.55.28734.0305.019.2376.949.88
126.717720.018.100.00.7136.74992.62.323624.0666.020.20.3217.44
130.0376880.01.520.00.4047.27438.37.30902.0329.012.6392.206.62
145.201770.018.101.00.7706.12783.42.722724.0666.020.2395.4311.48
1511.087400.018.100.00.7186.411100.01.858924.0666.020.2318.7515.02
160.114320.08.560.00.5206.78171.32.85615.0384.020.9395.587.67
170.056020.02.460.00.4887.83153.63.19923.0193.017.8392.634.45
180.241030.07.380.00.4936.08343.75.41595.0287.019.6396.9012.79
190.0937812.57.870.00.5245.88939.05.45095.0311.015.2390.5015.71
208.716750.018.100.00.6936.47198.81.725724.0666.020.2391.9817.12
217.367110.018.100.00.6796.19378.11.935624.0666.020.296.7321.52
221.387990.08.140.00.5385.95082.03.99004.0307.021.0232.6027.71
2314.333700.018.100.00.6146.22988.01.951224.0666.020.2383.3213.11
2428.655800.018.100.00.5975.155100.01.589424.0666.020.2210.9720.08
250.802710.08.140.00.5385.45636.63.79654.0307.021.0288.9911.69
261.002450.08.140.00.5386.67487.34.23904.0307.021.0380.2311.98
279.916550.018.100.00.6935.85277.81.500424.0666.020.2338.1629.97
280.131580.010.010.00.5476.17672.52.73016.0432.017.8393.3012.04
290.142310.010.010.00.5476.25484.22.25656.0432.017.8388.7410.45
..........................................
3240.131170.08.560.00.5206.12785.22.12245.0384.020.9387.6914.09
3251.354720.08.140.00.5386.072100.04.17504.0307.021.0376.7313.04
3260.101530.012.830.00.4376.27974.54.05225.0398.018.7373.6611.97
3270.229270.06.910.00.4486.03085.55.68943.0233.017.9392.7418.80
3280.0466680.01.520.00.4047.10736.67.30902.0329.012.6354.318.61
3290.080140.05.960.00.4995.85041.53.93425.0279.019.2396.908.77
3300.407710.06.201.00.5076.16491.33.04808.0307.017.4395.2421.46
3310.136420.010.590.00.4895.89122.33.94544.0277.018.6396.9010.87
3329.329090.018.100.00.7136.18598.72.261624.0666.020.2396.9018.13
3330.091030.02.460.00.4887.15592.22.70063.0193.017.8394.124.82
3340.0130135.01.520.00.4427.24149.37.03791.0284.015.5394.745.49
3350.590050.021.890.00.6246.37297.92.32744.0437.021.2385.7611.12
3361.126580.019.581.00.8715.01288.01.61025.0403.014.7343.2812.12
3370.0788680.04.950.00.4117.14827.75.11674.0245.019.2396.903.56
3380.217190.010.591.00.4895.80753.83.65264.0277.018.6390.9416.03
3390.537000.06.200.00.5045.98168.13.67158.0307.017.4378.3511.65
3403.321050.019.581.00.8715.403100.01.32165.0403.014.7396.9026.82
3411.496320.019.580.00.8715.404100.01.59165.0403.014.7341.6013.28
3420.387350.025.650.00.5815.61395.61.75722.0188.019.1359.2927.26
3430.066170.03.240.00.4605.86825.85.21464.0430.016.9382.449.97
3440.7857020.03.970.00.6477.01484.62.13295.0264.013.0384.0714.79
3451.413850.019.581.00.8716.12996.01.74945.0403.014.7321.0215.12
3460.060470.02.460.00.4886.15368.83.27973.0193.017.8387.1113.15
3478.492130.018.100.00.5846.34886.12.052724.0666.020.283.4517.64
3480.171340.010.010.00.5475.92888.22.46316.0432.017.8344.9115.76
3490.0387152.55.320.00.4056.20931.37.31726.0293.016.6396.907.14
3500.1265025.05.130.00.4536.76243.47.98098.0284.019.7395.589.50
3516.962150.018.100.00.7005.71397.01.926524.0666.020.2394.4317.11
3520.091640.010.810.00.4136.0657.85.28734.0305.019.2390.915.52
3535.581070.018.100.00.7136.43687.92.315824.0666.020.2100.1916.22

354 rows × 13 columns

#写明参数
param = {'silent':True #默认为False,通常要手动把它关闭掉
         ,'objective':'reg:linear'
         ,"eta":0.1}
num_round = 180 #n_estimators
#类train,可以直接导入的参数是训练数据,树的数量,其他参数都需要通过params来导入
bst = xgb.train(param, dtrain, num_round)
#接口predict
preds = bst.predict(dtest)
preds
array([ 6.4613175, 22.123888 , 30.755163 , 13.424351 ,  8.378565 ,
       23.608477 , 14.2151165, 16.026499 , 15.498961 , 14.10649  ,
       24.030867 , 34.36362  , 21.461111 , 28.839497 , 19.568035 ,
       10.188658 , 19.42369  , 23.539951 , 22.850523 , 23.198708 ,
       17.82486  , 16.07219  , 27.602034 , 20.773046 , 20.868807 ,
       15.865789 , 22.076588 , 29.292158 , 22.841051 , 15.770392 ,
       36.680496 , 21.057947 , 20.137005 , 23.777853 , 22.70615  ,
       23.863268 , 15.595315 , 24.565872 , 17.720552 , 33.95111  ,
       18.784286 , 20.483374 , 37.10668  , 18.068268 , 12.73839  ,
       31.186407 , 45.895035 , 12.696718 , 10.773068 , 36.064293 ,
       26.262571 , 19.908836 , 20.715096 , 48.814903 , 27.550056 ,
       25.225826 , 17.15366  , 21.215551 , 17.426773 , 18.478971 ,
       14.6453705, 22.841473 , 18.869593 , 29.990978 , 29.933191 ,
       18.756853 , 18.784918 , 16.33361  , 23.155968 , 19.144344 ,
       29.724382 , 42.121906 , 31.544363 , 23.017508 , 19.536028 ,
       23.851992 , 41.790577 , 28.676506 , 20.036425 , 21.723856 ,
       19.537868 , 46.349495 , 23.119637 ,  8.071444 , 26.358177 ,
       24.85706  , 17.057547 , 20.084204 , 18.54005  ,  7.157663 ,
       20.593962 , 15.451031 , 45.09552  , 34.435097 , 22.969654 ,
       10.10335  , 10.803318 , 18.42058  ,  7.800361 , 11.79309  ,
       30.755335 , 10.80648  , 26.122625 , 22.589502 , 31.219454 ,
       42.283318 , 19.274109 ,  7.3861685, 23.055706 , 14.315018 ,
       45.136368 , 21.243176 , 19.715647 , 24.533583 , 18.24247  ,
       28.382742 , 23.41182  , 19.962458 , 45.916683 , 17.521889 ,
       24.13039  , 26.147182 , 18.418781 , 17.606575 , 14.540631 ,
       20.595512 , 32.59128  , 10.155618 , 20.53032  , 21.477484 ,
       17.450048 , 20.154486 ,  8.010227 , 30.482618 , 29.677181 ,
       20.357098 , 18.222181 , 14.14504  , 10.100547 , 18.85027  ,
       41.85804  , 17.44544  , 22.907183 , 21.02398  , 29.799366 ,
       20.219465 , 12.404763 , 45.750965 , 25.56757  , 22.000706 ,
       14.194921 , 27.102774 ], dtype=float32)
from sklearn.metrics import r2_score
r2_score(Ytest,preds)
0.9260984298390122
MSE(Ytest,preds)
6.87682821415069
import xgboost as xgb

#为了便捷,使用全数据
dfull = xgb.DMatrix(X,y)
#设定参数
# Booster parameters for xgb.cv; the objective is set through the
# 'objective' key ('obj' is not a booster parameter name).
param1 = {'silent':True,'objective':'reg:linear',"gamma":0}
num_round = 100
n_fold=5 #sklearn - KFold
#使用类xgb.cv
time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round,n_fold)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:00:610364
#看看类xgb.cv生成了什么结果?
cvresult1 #随着树不断增加,我们的模型的效果如何变化
train-rmse-meantrain-rmse-stdtest-rmse-meantest-rmse-std
017.1055780.12911617.1632150.584296
112.3379730.09755712.5197360.473458
28.9940710.0657569.4045340.472310
36.6294810.0503237.2503350.500342
44.9544060.0332095.9208120.591874
53.7814540.0296045.0451900.687971
62.9477670.0387864.4720300.686492
72.3577480.0420404.1793140.737935
81.9519070.0449723.9798780.798198
91.6608950.0448943.8707510.812331
101.4642960.0494223.8161960.835251
111.3233620.0562403.7881250.841643
121.2144680.0465243.7669730.848989
131.1373110.0445223.7411990.872370
141.0646290.0422453.7291940.879429
151.0102860.0388923.7179970.879572
160.9412580.0383603.7067360.878032
170.8835990.0566403.6938860.873913
180.8296740.0572843.6932960.883429
190.7723320.0428993.6875100.880928
200.7315570.0491503.6870370.879180
210.6906980.0411903.6775070.882060
220.6577430.0421373.6753430.883635
230.6199880.0540973.6710060.879224
240.5854140.0525853.6709510.867470
250.5487230.0544403.6735980.863241
260.5272660.0496303.6739880.867116
270.5044050.0403763.6717020.864566
280.4685340.0330203.6713240.862536
290.4486330.0321913.6750740.864713
...............
700.0710570.0154113.6680670.859435
710.0679460.0139603.6677080.859370
720.0651970.0124753.6681740.859307
730.0627890.0125383.6687380.859471
740.0602940.0126693.6689500.860112
750.0582780.0120553.6690840.859966
760.0554020.0110653.6696270.859505
770.0538190.0110723.6699040.859294
780.0512800.0112153.6701850.859204
790.0487480.0099883.6700920.859250
800.0469720.0092333.6698690.858892
810.0447530.0086643.6697020.858676
820.0431480.0086363.6697040.858921
830.0418230.0083553.6695960.858843
840.0402570.0083783.6697300.858459
850.0385180.0077313.6698350.858698
860.0366940.0069283.6697050.858958
870.0349320.0061743.6697220.858715
880.0339470.0062063.6699640.858547
890.0327060.0061763.6699880.858516
900.0313170.0061713.6701160.858512
910.0296970.0054733.6699300.858759
920.0285610.0055993.6699060.858549
930.0275850.0056943.6698220.858554
940.0264360.0054143.6699850.858390
950.0252040.0051453.6699210.858313
960.0244220.0052423.6699830.858255
970.0236610.0051173.6699470.858331
980.0225620.0047043.6698680.858578
990.0214960.0047383.6698240.858305

100 rows × 4 columns

plt.figure(figsize=(20,5))
plt.grid()
plt.plot(range(1,101),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,101),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.legend()
plt.show()

#从这个图中,我们可以看出什么?
#怎样从图中观察模型的泛化能力?
#从这个图的角度来说,模型的调参目标是什么?

![output_66_0](output_66_0.png)

# Which evaluation metric does xgboost use by default for regression models?
# Here the metric is set explicitly to MAE through "eval_metric".
# The booster parameter is spelled "objective"; an "obj" key is not a booster
# parameter and would leave the default objective in place.
param1 = {'silent':True,'objective':'reg:linear',"gamma":0,"eval_metric":"mae"}
cvresult1 = xgb.cv(param1, dfull, num_round,n_fold)

# Take the x-axis from the result table itself so it always has exactly one
# point per boosting round, whatever num_round was when xgb.cv ran
# (a hard-coded range raises a shape mismatch as soon as the two disagree).
rounds = range(1, len(cvresult1) + 1)
plt.figure(figsize=(20,5))
plt.grid()
plt.plot(rounds,cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(rounds,cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.legend()
plt.show()

![output_68_0](output_68_0.png)

# Two parameter sets that differ only in gamma, cross-validated with the same
# number of boosting rounds and folds so their curves can be compared.
# "objective" is the booster parameter name ("obj" is not one).
param1 = {'silent':True,'objective':'reg:linear',"gamma":0}
param2 = {'silent':True,'objective':'reg:linear',"gamma":20}
num_round = 180
n_fold=5

time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round,n_fold)
# Format the elapsed time as MM:SS:microseconds from a timedelta.
# datetime.fromtimestamp() would treat the duration as a local-time instant,
# which shifts the minutes field in time zones with a non-whole-hour UTC offset.
elapsed = datetime.timedelta(seconds=time()-time0)
print("{:02d}:{:02d}:{:06d}".format(elapsed.seconds // 60, elapsed.seconds % 60, elapsed.microseconds))
00:01:083104
# Time the cross-validation of the second parameter set (param2).
time0 = time()
cvresult2 = xgb.cv(param2, dfull, num_round,n_fold)
# Format the elapsed time as MM:SS:microseconds from a timedelta.
# datetime.fromtimestamp() would treat the duration as a local-time instant,
# which shifts the minutes field in time zones with a non-whole-hour UTC offset.
elapsed = datetime.timedelta(seconds=time()-time0)
print("{:02d}:{:02d}:{:06d}".format(elapsed.seconds // 60, elapsed.seconds % 60, elapsed.microseconds))
00:01:359378
# Overlay the train (column 0) and test (column 2) curves of both CV runs so
# the gamma=0 and gamma=20 settings can be compared round by round.
plt.figure(figsize=(20, 5))
plt.grid()
rounds = range(1, 181)
curves = (
    (cvresult1, 0, "red", "train,gamma=0"),
    (cvresult1, 2, "orange", "test,gamma=0"),
    (cvresult2, 0, "green", "train,gamma=20"),
    (cvresult2, 2, "blue", "test,gamma=20"),
)
for table, column, colour, tag in curves:
    plt.plot(rounds, table.iloc[:, column], c=colour, label=tag)
plt.legend()
plt.show()

#从这里,你看出gamma是如何控制过拟合了吗?控制训练集上的训练 - 降低训练集上的表现

![output_71_0](output_71_0.png)

import xgboost as xgb
import matplotlib.pyplot as plt
from time import time
import datetime
from sklearn.datasets import load_breast_cancer
data2 = load_breast_cancer()

x2 = data2.data
y2 = data2.target

dfull2 = xgb.DMatrix(x2,y2)

# Booster parameters for a binary classifier; the two sets differ only in gamma.
# - The objective must be given under the key "objective"; an "obj" key is not a
#   booster parameter, so the run would fall back to the default objective.
# - The fold count and the evaluation metric are arguments of xgb.cv (nfold=,
#   metrics=); "nfold"/"eval_metrics" entries in the params dict are not honoured,
#   which leaves xgb.cv at its default number of folds.
param1 = {'silent':True,'objective':'binary:logistic',"gamma":0}
param2 = {'silent':True,'objective':'binary:logistic',"gamma":1}
num_round = 100
time0 = time()
cvresult1 = xgb.cv(param1, dfull2, num_round,nfold=5,metrics="error")
# Format the elapsed time as MM:SS:microseconds from a timedelta.
# datetime.fromtimestamp() would treat the duration as a local-time instant,
# which shifts the minutes field in time zones with a non-whole-hour UTC offset.
elapsed = datetime.timedelta(seconds=time()-time0)
print("{:02d}:{:02d}:{:06d}".format(elapsed.seconds // 60, elapsed.seconds % 60, elapsed.microseconds))
00:00:271581
# Time the cross-validation of param2 on the breast-cancer data.
time0 = time()
# The fold count is an argument of xgb.cv; it is passed here explicitly because
# an "nfold" entry inside the params dict is not honoured.
cvresult2 = xgb.cv(param2, dfull2, num_round,nfold=5,metrics="error")
# Format the elapsed time as MM:SS:microseconds from a timedelta.
# datetime.fromtimestamp() would treat the duration as a local-time instant,
# which shifts the minutes field in time zones with a non-whole-hour UTC offset.
elapsed = datetime.timedelta(seconds=time()-time0)
print("{:02d}:{:02d}:{:06d}".format(elapsed.seconds // 60, elapsed.seconds % 60, elapsed.microseconds))
00:00:443810
# Overlay the train (column 0) and test (column 2) curves of both CV runs on the
# breast-cancer data: gamma=0 against gamma=1.
plt.figure(figsize=(20, 5))
plt.grid()
rounds = range(1, 101)
curves = (
    (cvresult1, 0, "red", "train,gamma=0"),
    (cvresult1, 2, "orange", "test,gamma=0"),
    (cvresult2, 0, "green", "train,gamma=1"),
    (cvresult2, 2, "blue", "test,gamma=1"),
)
for table, column, colour, tag in curves:
    plt.plot(rounds, table.iloc[:, column], c=colour, label=tag)
plt.legend()
plt.show()

![output_76_0](output_76_0.png)

dfull = xgb.DMatrix(X,y)

# Baseline booster parameters used as the reference curve for later tuning.
# "objective" is the booster parameter name ("obj" is not one).
param1 = {'silent':True
          ,'objective':'reg:linear'
          ,"subsample":1
          ,"max_depth":6
          ,"eta":0.3
          ,"gamma":0
          ,"lambda":1
          ,"alpha":0
          ,"colsample_bytree":1
          ,"colsample_bylevel":1
          ,"colsample_bynode":1}
num_round = 200
time0 = time()
# The fold count is an argument of xgb.cv, not a booster parameter: an "nfold"
# entry in the params dict is not honoured, so it is passed here instead.
cvresult1 = xgb.cv(param1, dfull, num_round, nfold=5)
# Format the elapsed time as MM:SS:microseconds from a timedelta.
# datetime.fromtimestamp() would treat the duration as a local-time instant,
# which shifts the minutes field in time zones with a non-whole-hour UTC offset.
elapsed = datetime.timedelta(seconds=time()-time0)
print("{:02d}:{:02d}:{:06d}".format(elapsed.seconds // 60, elapsed.seconds % 60, elapsed.microseconds))

# Plot train (column 0) and test (column 2) curves; the x-axis follows num_round
# so it stays in step with the number of rows returned by xgb.cv.
fig,ax = plt.subplots(1,figsize=(15,8))
ax.set_ylim(top=5)
ax.grid()
ax.plot(range(1,num_round+1),cvresult1.iloc[:,0],c="red",label="train,original")
ax.plot(range(1,num_round+1),cvresult1.iloc[:,2],c="orange",label="test,original")
ax.legend(fontsize="xx-large")
plt.show()
00:00:513584

![output_78_1](output_78_1.png)

# Compare three booster parameter sets with 5-fold cross-validation and draw
# their train/test curves on one axis:
#   param1 - the original baseline, param2 - the previous tuning attempt,
#   param3 - the current tuning attempt.
# Notes on the fixes in this cell:
# - "objective" is the booster parameter name ("obj" is not one).
# - The fold count is an argument of xgb.cv; an "nfold" entry inside the params
#   dict is not honoured, so every call passes nfold=N_FOLD explicitly.


def _print_elapsed(start):
    """Print the wall-clock time since *start* (a time() value) as MM:SS:microseconds.

    A timedelta is used instead of datetime.fromtimestamp(), which would treat
    the duration as a local-time instant and shift the minutes field in time
    zones with a non-whole-hour UTC offset.
    """
    elapsed = datetime.timedelta(seconds=time() - start)
    print("{:02d}:{:02d}:{:06d}".format(elapsed.seconds // 60, elapsed.seconds % 60, elapsed.microseconds))


N_FOLD = 5

param1 = {'silent':True
          ,'objective':'reg:linear'
          ,"subsample":1
          ,"max_depth":6
          ,"eta":0.3
          ,"gamma":0
          ,"lambda":1
          ,"alpha":0
          ,"colsample_bytree":1
          ,"colsample_bylevel":1
          ,"colsample_bynode":1}
num_round = 200

time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round, nfold=N_FOLD)
_print_elapsed(time0)

fig,ax = plt.subplots(1,figsize=(15,8))
ax.set_ylim(top=5)
ax.grid()
# One x position per boosting round, kept in step with num_round.
rounds = range(1,num_round+1)
ax.plot(rounds,cvresult1.iloc[:,0],c="red",label="train,original")
ax.plot(rounds,cvresult1.iloc[:,2],c="orange",label="test,original")

param2 = {'silent':True
          ,'objective':'reg:linear'
          ,"max_depth":2
          ,"eta":0.05
          ,"gamma":0
          ,"lambda":1
          ,"alpha":0
          ,"colsample_bytree":1
          ,"colsample_bylevel":0.4
          ,"colsample_bynode":1}

param3 = {'silent':True
          ,'objective':'reg:linear'
          ,"subsample":1
          ,"eta":0.05
          ,"gamma":20
          ,"lambda":3.5
          ,"alpha":0.2
          ,"max_depth":4
          ,"colsample_bytree":0.4
          ,"colsample_bylevel":0.6
          ,"colsample_bynode":1}

time0 = time()
cvresult2 = xgb.cv(param2, dfull, num_round, nfold=N_FOLD)
_print_elapsed(time0)

time0 = time()
cvresult3 = xgb.cv(param3, dfull, num_round, nfold=N_FOLD)
_print_elapsed(time0)

ax.plot(rounds,cvresult2.iloc[:,0],c="green",label="train,last")
ax.plot(rounds,cvresult2.iloc[:,2],c="blue",label="test,last")
ax.plot(rounds,cvresult3.iloc[:,0],c="gray",label="train,this")
ax.plot(rounds,cvresult3.iloc[:,2],c="pink",label="test,this")
ax.legend(fontsize="xx-large")
plt.show()
00:00:532621
00:00:223373
00:00:259346

![output_79_1](output_79_1.png)

import pickle
dtrain = xgb.DMatrix(Xtrain,Ytrain)

# Set the parameters and train the model.
# "objective" is the booster parameter name ("obj" is not one).
param = {'silent':True
          ,'objective':'reg:linear'
          ,"subsample":1
          ,"eta":0.05
          ,"gamma":20
          ,"lambda":3.5
          ,"alpha":0.2
          ,"max_depth":4
          ,"colsample_bytree":0.4
          ,"colsample_bylevel":0.6
          ,"colsample_bynode":1}
num_round = 180

bst = xgb.train(param, dtrain, num_round)
# Save the model. The with-block closes the file as soon as the dump finishes,
# so the data is flushed to disk and the handle is not left open.
with open("xgboostonboston.dat","wb") as model_file:
    pickle.dump(bst, model_file)

# Note: open() is often used with "w" or "r", but those modes are meant for text files (e.g. .txt).
# To store or load the model object itself rather than text, use "wb" and "rb" as the mode.
# "wb" writes binary and "rb" reads binary; the file written this way holds a model that can be loaded and used again.
# Where was the model saved?
# NOTE(review): a relative filename passed to open() is resolved against the current working
# directory; sys.path is the import search path, so os.getcwd() would answer this more directly.
import sys
sys.path
['C:\\Pythonwork\\micro-class\\11 xgboost',
 'C:\\Python\\python37.zip',
 'C:\\Python\\DLLs',
 'C:\\Python\\lib',
 'C:\\Python',
 '',
 'C:\\Python\\lib\\site-packages',
 'C:\\Python\\lib\\site-packages\\win32',
 'C:\\Python\\lib\\site-packages\\win32\\lib',
 'C:\\Python\\lib\\site-packages\\Pythonwin',
 'C:\\Python\\lib\\site-packages\\IPython\\extensions',
 'C:\\Users\\Shuyu\\.ipython']
#重新打开jupyter lab

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pickle
import xgboost as xgb

# Rebuild the same train/test split as before (same test_size and random_state).
data = load_boston()

X = data.data
y = data.target

Xtrain,Xtest,Ytrain,Ytest = TTS(X,y,test_size=0.3,random_state=420)
# Note: a model built with the native xgboost API must be fed xgboost's own data type (DMatrix).
dtest = xgb.DMatrix(Xtest,Ytest)
# Load the model. The with-block closes the file handle right after loading.
# pickle can execute arbitrary code while loading: only unpickle files you wrote yourself.
with open("xgboostonboston.dat", "rb") as model_file:
    loaded_model = pickle.load(model_file)
print("Loaded model from: xgboostonboston.dat")
Loaded model from: xgboostonboston.dat
#做预测,直接调用接口predict
ypreds = loaded_model.predict(dtest)
ypreds
array([ 9.244746, 22.536953, 28.47614 , 13.126131,  9.944413, 21.356094,
       15.187935, 15.559099, 15.629611, 15.555439, 21.427156, 35.502792,
       20.827318, 29.397932, 21.669186, 11.906522, 21.464252, 26.143337,
       26.300356, 23.474188, 18.186035, 15.851086, 22.928507, 22.919674,
       20.557487, 16.27315 , 22.000988, 25.230766, 23.12165 , 16.663473,
       34.747093, 20.003593, 20.617601, 23.74025 , 23.044952, 24.849056,
       15.414761, 23.383522, 18.500463, 33.790466, 18.009186, 18.729418,
       33.181175, 18.834534, 15.085677, 27.601177, 42.75243 , 15.359873,
       10.37829 , 37.5367  , 27.097404, 20.73775 , 20.198935, 46.20087 ,
       26.959623, 24.566458, 18.678255, 20.913795, 17.369501, 17.823708,
       15.136806, 24.533068, 19.465569, 30.474009, 29.571526, 19.773672,
       21.554045, 17.590807, 22.250225, 18.275839, 29.012346, 40.198055,
       30.235825, 23.174484, 20.191778, 23.742437, 38.217915, 27.173447,
       21.068003, 20.5974  , 18.412853, 45.326836, 22.941956,  9.055015,
       27.04054 , 23.45833 , 17.310354, 20.762442, 15.6619  , 12.178641,
       21.293903, 19.826134, 41.0362  , 31.300192, 24.400661, 11.267941,
       15.763796, 20.984198,  9.232577, 11.090055, 32.739227, 16.265066,
       24.975492, 24.905188, 34.348663, 41.02216 , 20.181097,  8.897793,
       22.894953, 15.023113, 45.222473, 21.289068, 22.882399, 24.792355,
       19.141815, 27.372849, 24.132881, 19.243576, 43.235798, 17.438314,
       24.561804, 24.187195, 17.001463, 18.172377, 15.483843, 23.802166,
       31.079023, 10.322498, 21.977345, 19.267714, 15.559681, 19.336842,
        8.979549, 28.35794 , 29.80491 , 21.987814, 19.893597, 19.730898,
       10.501988, 17.405378, 40.51527 , 17.420282, 24.272373, 19.771631,
       32.620422, 19.19032 , 12.364113, 38.63305 , 24.189354, 23.38174 ,
       16.924698, 22.633028], dtype=float32)
from sklearn.metrics import mean_squared_error as MSE, r2_score
MSE(Ytest,ypreds)
9.107608696116197
r2_score(Ytest,ypreds)
0.9021254331073938
# Train a booster again and persist it with joblib instead of pickle.
# NOTE(review): param, dtrain and num_round are not defined in the cells after
# the notebook restart above; presumably they come from the earlier training cell - confirm.
bst = xgb.train(param, dtrain, num_round)
import joblib

# Likewise, you can check where the model was saved
joblib.dump(bst,"xgboost-boston.dat")
['xgboost-boston.dat']
loaded_model = joblib.load("xgboost-boston.dat")
dtest = xgb.DMatrix(Xtest,Ytest)
ypreds = loaded_model.predict(dtest)
ypreds
array([ 9.244746, 22.536953, 28.47614 , 13.126131,  9.944413, 21.356094,
       15.187935, 15.559099, 15.629611, 15.555439, 21.427156, 35.502792,
       20.827318, 29.397932, 21.669186, 11.906522, 21.464252, 26.143337,
       26.300356, 23.474188, 18.186035, 15.851086, 22.928507, 22.919674,
       20.557487, 16.27315 , 22.000988, 25.230766, 23.12165 , 16.663473,
       34.747093, 20.003593, 20.617601, 23.74025 , 23.044952, 24.849056,
       15.414761, 23.383522, 18.500463, 33.790466, 18.009186, 18.729418,
       33.181175, 18.834534, 15.085677, 27.601177, 42.75243 , 15.359873,
       10.37829 , 37.5367  , 27.097404, 20.73775 , 20.198935, 46.20087 ,
       26.959623, 24.566458, 18.678255, 20.913795, 17.369501, 17.823708,
       15.136806, 24.533068, 19.465569, 30.474009, 29.571526, 19.773672,
       21.554045, 17.590807, 22.250225, 18.275839, 29.012346, 40.198055,
       30.235825, 23.174484, 20.191778, 23.742437, 38.217915, 27.173447,
       21.068003, 20.5974  , 18.412853, 45.326836, 22.941956,  9.055015,
       27.04054 , 23.45833 , 17.310354, 20.762442, 15.6619  , 12.178641,
       21.293903, 19.826134, 41.0362  , 31.300192, 24.400661, 11.267941,
       15.763796, 20.984198,  9.232577, 11.090055, 32.739227, 16.265066,
       24.975492, 24.905188, 34.348663, 41.02216 , 20.181097,  8.897793,
       22.894953, 15.023113, 45.222473, 21.289068, 22.882399, 24.792355,
       19.141815, 27.372849, 24.132881, 19.243576, 43.235798, 17.438314,
       24.561804, 24.187195, 17.001463, 18.172377, 15.483843, 23.802166,
       31.079023, 10.322498, 21.977345, 19.267714, 15.559681, 19.336842,
        8.979549, 28.35794 , 29.80491 , 21.987814, 19.893597, 19.730898,
       10.501988, 17.405378, 40.51527 , 17.420282, 24.272373, 19.771631,
       32.620422, 19.19032 , 12.364113, 38.63305 , 24.189354, 23.38174 ,
       16.924698, 22.633028], dtype=float32)
MSE(Ytest, ypreds)
9.107608696116197
r2_score(Ytest,ypreds)
0.9021254331073938
# Use the sklearn-API model: fit an XGBRegressor and persist it with joblib.
# Note that n_estimators is 200 here, while the native-API run above used num_round = 180.
from xgboost import XGBRegressor as XGBR

bst = XGBR(n_estimators=200
           ,eta=0.05,gamma=20
           ,reg_lambda=3.5
           ,reg_alpha=0.2
           ,max_depth=4
           ,colsample_bytree=0.4
           ,colsample_bylevel=0.6).fit(Xtrain,Ytrain) # training done
joblib.dump(bst,"xgboost-boston-sklearn.dat")
['xgboost-boston-sklearn.dat']
loaded_model = joblib.load("xgboost-boston-sklearn.dat")
#则这里可以直接导入Xtest,直接是我们的numpy
ypreds = loaded_model.predict(Xtest)
Xtest
array([[4.15292e+01, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,
        3.29460e+02, 2.73800e+01],
       [2.73100e-02, 0.00000e+00, 7.07000e+00, ..., 1.78000e+01,
        3.96900e+02, 9.14000e+00],
       [3.15000e-02, 9.50000e+01, 1.47000e+00, ..., 1.70000e+01,
        3.96900e+02, 4.56000e+00],
       ...,
       [5.08300e-02, 0.00000e+00, 5.19000e+00, ..., 2.02000e+01,
        3.89710e+02, 5.68000e+00],
       [3.77498e+00, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,
        2.20100e+01, 1.71500e+01],
       [1.96091e+01, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,
        3.96900e+02, 1.34400e+01]])
dtest
<xgboost.core.DMatrix at 0x29e30670668>
ypreds
array([ 9.350334 , 21.501623 , 30.219057 , 13.021226 ,  9.883689 ,
       20.977922 , 16.023008 , 15.8910475, 15.512305 , 15.706607 ,
       22.096102 , 35.381573 , 20.3307   , 27.129421 , 19.997156 ,
       10.935587 , 20.25071  , 26.188572 , 26.711943 , 22.600443 ,
       18.23832  , 15.876045 , 26.263977 , 22.706024 , 20.18491  ,
       15.891692 , 21.4781   , 29.047956 , 23.371012 , 17.167185 ,
       35.699898 , 20.490337 , 20.195292 , 23.81444  , 23.106022 ,
       25.709312 , 15.0182905, 22.621248 , 18.576109 , 34.25664  ,
       17.46115  , 19.159126 , 34.79234  , 17.766731 , 17.141891 ,
       27.755646 , 39.786766 , 22.49913  , 10.246634 , 36.76105  ,
       26.294876 , 20.75917  , 19.893272 , 46.62629  , 26.549704 ,
       24.040398 , 17.769514 , 20.76889  , 16.139618 , 17.494894 ,
       16.005596 , 24.28487  , 19.15237  , 31.407684 , 27.862312 ,
       18.877817 , 20.50497  , 16.094156 , 22.622025 , 17.762297 ,
       28.518019 , 41.146317 , 32.52681  , 23.117966 , 19.125128 ,
       24.141544 , 39.041847 , 25.901724 , 20.974117 , 19.626917 ,
       18.567612 , 46.46465  , 23.03303  ,  9.912106 , 26.407642 ,
       23.466772 , 16.985506 , 20.73746  , 15.679997 , 11.697191 ,
       21.320868 , 20.333689 , 41.616425 , 31.659132 , 25.605923 ,
       12.362759 , 14.593165 , 20.577328 ,  9.253377 , 11.1253805,
       32.878246 , 15.840851 , 24.695955 , 24.882996 , 34.643425 ,
       41.556873 , 19.726238 ,  8.808649 , 23.04128  , 14.709186 ,
       46.10303  , 21.435535 , 21.97892  , 24.299171 , 19.591938 ,
       27.527737 , 23.80468  , 18.782711 , 44.266346 , 17.328068 ,
       23.030151 , 23.801643 , 16.483137 , 18.219353 , 15.713125 ,
       23.655058 , 32.294373 , 10.60579  , 22.099716 , 19.26955  ,
       14.293162 , 19.386055 ,  8.824598 , 26.909697 , 29.539446 ,
       20.38691  , 20.832077 , 22.507433 , 11.142808 , 17.685743 ,
       40.230915 , 17.526121 , 23.09964  , 19.899158 , 31.775164 ,
       19.718151 , 12.164877 , 40.867558 , 24.465397 , 22.134802 ,
       15.041253 , 28.63522  ], dtype=float32)
MSE(Ytest, ypreds)
10.198269690947479
r2_score(Ytest,ypreds)
0.8904046866351292
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost import XGBClassifier as XGBC
from sklearn.datasets import make_blobs #自创数据集
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import confusion_matrix as cm, recall_score as recall, roc_auc_score as auc
# Build a two-class, imbalanced toy dataset with make_blobs.
class_1 = 500 # class 1 has 500 samples
class_2 = 50 # class 2 has only 50
centers = [[0.0, 0.0], [2.0, 2.0]] # centres of the two classes
clusters_std = [1.5, 0.5] # spread of each class, passed as cluster_std (a standard deviation); the larger class is usually looser
X, y = make_blobs(n_samples=[class_1, class_2],
                  centers=centers,
                  cluster_std=clusters_std,
                  random_state=0, shuffle=False)
X.shape
(550, 2)
y
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
(y == 1).sum() / y.shape[0] #9%
0.09090909090909091
# Hold out 30% of the samples as the test split (fixed random_state for repeatability).
Xtrain, Xtest, Ytrain, Ytest = TTS(X,y,test_size=0.3,random_state=420)
# Modelling with the sklearn API

# Fit a classifier with all-default settings and predict labels for the test split.
clf = XGBC().fit(Xtrain,Ytrain)
ypred = clf.predict(Xtest)
ypred
array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
clf.score(Xtest,Ytest) #默认模型评估指标 - 准确率
0.9272727272727272
cm(Ytest,ypred,labels=[1,0]) #少数类写在前面
array([[  9,   4],
       [  8, 144]], dtype=int64)
recall(Ytest,ypred)
0.6923076923076923
auc(Ytest,clf.predict_proba(Xtest)[:,1])
0.9671052631578947
# Negative / positive sample ratio (the value passed as scale_pos_weight)
clf_ = XGBC(scale_pos_weight=10).fit(Xtrain,Ytrain)
ypred_ = clf_.predict(Xtest)
# Accuracy, confusion matrix (minority class first), recall and AUC on the test split.
# Only the value of the last expression appears in the output below.
clf_.score(Xtest,Ytest)

cm(Ytest,ypred_,labels=[1,0])

recall(Ytest,ypred_)

auc(Ytest,clf_.predict_proba(Xtest)[:,1])
0.9696356275303644
# How do recall, AUC and accuracy change as the positive-class weight grows?
# For every weight: refit the classifier, predict the test split, then print
# the weight followed by the three metrics.
for weight in (1, 5, 10, 20, 30):
    clf_ = XGBC(scale_pos_weight=weight).fit(Xtrain, Ytrain)
    ypred_ = clf_.predict(Xtest)
    test_accuracy = clf_.score(Xtest, Ytest)
    test_recall = recall(Ytest, ypred_)
    # AUC is computed from the predicted probability of class 1, not from labels.
    test_auc = auc(Ytest, clf_.predict_proba(Xtest)[:, 1])
    print(weight)
    print("\tAccuracy:{}".format(test_accuracy))
    print("\tRecall:{}".format(test_recall))
    print("\tAUC:{}".format(test_auc))
1
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9671052631578947
5
	Accuracy:0.9454545454545454
	Recall:0.9230769230769231
	AUC:0.9665991902834008
10
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9696356275303644
20
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9706477732793523
30
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9701417004048584
#负/正样本比例
clf_ = XGBC(scale_pos_weight=20).fit(Xtrain,Ytrain)
ypred_ = clf_.predict(Xtest)
clf_.score(Xtest,Ytest)
0.9515151515151515
cm(Ytest,ypred_,labels=[1,0])
array([[ 13,   0],
       [  8, 144]], dtype=int64)
recall(Ytest,ypred_)
1.0
auc(Ytest,clf_.predict_proba(Xtest)[:,1])
0.9706477732793523
# Wrap the train and test splits in DMatrix objects for the native xgboost API.
dtrain = xgb.DMatrix(Xtrain,Ytrain)
dtest = xgb.DMatrix(Xtest,Ytest)
# Look at the predict interface of the native xgboost library
param = {'silent':True,'objective':'binary:logistic',"eta":0.1,"scale_pos_weight":1}
num_round = 100
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)
# What does preds contain? (the output below shows float scores between 0 and 1, not class labels)
preds
array([0.00110357, 0.00761518, 0.00110357, 0.00110357, 0.93531454,
       0.00466839, 0.00110357, 0.00110357, 0.00110357, 0.00110357,
       0.00110357, 0.00410493, 0.00454478, 0.00571528, 0.00751026,
       0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357,
       0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357,
       0.00712637, 0.00110357, 0.00110357, 0.00110357, 0.00110357,
       0.00110357, 0.00110357, 0.00110357, 0.00793251, 0.00466839,
       0.00110357, 0.00339395, 0.00657186, 0.00110357, 0.00457053,
       0.00571528, 0.0026763 , 0.00110357, 0.00110357, 0.00110357,
       0.00884932, 0.00712637, 0.00110357, 0.00712637, 0.00466839,
       0.00110357, 0.00110357, 0.00712637, 0.00110357, 0.00110357,
       0.00110357, 0.00110357, 0.63748044, 0.00110357, 0.00793251,
       0.00110357, 0.00451971, 0.00644181, 0.00110357, 0.00110357,
       0.00110357, 0.00110357, 0.00751026, 0.00712637, 0.00110357,
       0.00866458, 0.00110357, 0.00110357, 0.00110357, 0.91610426,
       0.00110357, 0.00110357, 0.89246494, 0.0026763 , 0.00501714,
       0.00761518, 0.00884932, 0.00339395, 0.00110357, 0.93531454,
       0.00110357, 0.00110357, 0.00110357, 0.82530665, 0.00751026,
       0.00110357, 0.35174078, 0.00110357, 0.00110357, 0.70393246,
       0.00110357, 0.76804197, 0.00110357, 0.00110357, 0.00110357,
       0.00110357, 0.96656513, 0.00110357, 0.00571528, 0.25400913,
       0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00457053,
       0.00110357, 0.00110357, 0.00110357, 0.89246494, 0.00110357,
       0.9518535 , 0.0026763 , 0.00712637, 0.00110357, 0.00501714,
       0.00110357, 0.00110357, 0.00571528, 0.00110357, 0.00110357,
       0.00712637, 0.00110357, 0.00110357, 0.00712637, 0.00110357,
       0.25136763, 0.00110357, 0.00110357, 0.00110357, 0.00110357,
       0.00110357, 0.8904051 , 0.3876418 , 0.00110357, 0.00457053,
       0.00657186, 0.9366597 , 0.00866458, 0.00110357, 0.00501714,
       0.00501714, 0.00110357, 0.00110357, 0.00368543, 0.00501714,
       0.9830577 , 0.00110357, 0.00644181, 0.00110357, 0.00571528,
       0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00466839,
       0.00110357, 0.00110357, 0.92388713, 0.90231985, 0.80084217],
      dtype=float32)
# Choose the decision threshold ourselves: scores above 0.5 become 1.0 and all
# others 0.0 (same dtype as preds). Binarising in one step means ypred never
# holds a mix of labels and raw scores, so it can be inspected right away.
ypred = (preds > 0.5).astype(preds.dtype)
ypred
array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.], dtype=float32)
ypred[ypred != 1] = 0
# Spell out the settings to compare: each display label is paired with the
# scale_pos_weight value it stands for, then split into two parallel lists.
_weight_labels = [
    ("negative vs positive: 1", 1),
    ("negative vs positive: 5", 5),
    ("negative vs positive: 10", 10),
]
names = [label for label, _ in _weight_labels]
scale_pos_weight = [weight for _, weight in _weight_labels]
list(zip(names, scale_pos_weight))
[('negative vs positive: 1', 1),
 ('negative vs positive: 5', 5),
 ('negative vs positive: 10', 10)]
#导入模型评估指标
from sklearn.metrics import accuracy_score as accuracy, recall_score as recall, roc_auc_score as auc

# For each (label, weight) pair: train a booster with the native xgboost API,
# binarise its predicted scores at 0.5 and report accuracy, recall and AUC on
# the test split.
for name, i in zip(names, scale_pos_weight):
    num_round = 100
    param = dict(silent=True, objective='binary:logistic', eta=0.1, scale_pos_weight=i)
    clf = xgb.train(param, dtrain, num_round)
    preds = clf.predict(dtest)
    # Scores above 0.5 become 1.0, everything else 0.0 (same dtype as preds).
    ypred = (preds > 0.5).astype(preds.dtype)
    report = [name
              ,"\tAccuracy:{}".format(accuracy(Ytest, ypred))
              ,"\tRecall:{}".format(recall(Ytest, ypred))
              ,"\tAUC:{}".format(auc(Ytest, preds))]
    print("\n".join(report))
negative vs positive: 1
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9741902834008097
negative vs positive: 5
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9635627530364372
negative vs positive: 10
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9665991902834008
# We can of course also try different decision thresholds.
for name,i in zip(names,scale_pos_weight):
    # The booster, its predicted scores and the AUC depend only on the weight,
    # not on the threshold, so they are computed once per weight instead of
    # being retrained and re-evaluated for every threshold.
    param= {'silent':True,'objective':'binary:logistic'
            ,"eta":0.1,"scale_pos_weight":i}
    clf = xgb.train(param, dtrain, num_round)
    preds = clf.predict(dtest)
    auc_score = auc(Ytest,preds)
    for thres in [0.3,0.5,0.7,0.9]:
        # Scores above the threshold become 1.0, everything else 0.0.
        ypred = (preds > thres).astype(preds.dtype)
        print("{},thresholds:{}".format(name,thres))
        print("\tAccuracy:{}".format(accuracy(Ytest,ypred)))
        print("\tRecall:{}".format(recall(Ytest,ypred)))
        print("\tAUC:{}".format(auc_score))
negative vs positive: 1,thresholds:0.3
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.5
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.7
	Accuracy:0.9212121212121213
	Recall:0.6153846153846154
	AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.9
	Accuracy:0.9515151515151515
	Recall:0.5384615384615384
	AUC:0.9741902834008097
negative vs positive: 5,thresholds:0.3
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.5
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.7
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.9
	Accuracy:0.9212121212121213
	Recall:0.6153846153846154
	AUC:0.9635627530364372
negative vs positive: 10,thresholds:0.3
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.5
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.7
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.9
	Accuracy:0.9212121212121213
	Recall:0.6153846153846154
	AUC:0.9665991902834008