使用python+sklearn的决策树方法预测是否有信用风险
1个回答
展开全部
import numpy as np11
import pandas as pd11
names=("Balance,Duration,History,Purpose,Credit amount,Savings,Employment,instPercent,sexMarried,Guarantors,Residence duration,Assets,Age,concCredit,Apartment,Credits,Occupation,Dependents,hasPhone,Foreign,lable").split(',')11
data=pd.read_csv("Desktop/sunshengyun/data/german/german.data",sep='\s+',names=names)11
data.head()11
Balance
Duration
History
Purpose
Credit amount
Savings
Employment
instPercent
sexMarried
Guarantors
…
Assets
Age
concCredit
Apartment
Credits
Occupation
Dependents
hasPhone
Foreign
lable
0
A11 6 A34 A43 1169 A65 A75 4 A93 A101 … A121 67 A143 A152 2 A173 1 A192 A201 1
1
A12 48 A32 A43 5951 A61 A73 2 A92 A101 … A121 22 A143 A152 1 A173 1 A191 A201 2
2
A14 12 A34 A46 2096 A61 A74 2 A93 A101 … A121 49 A143 A152 1 A172 2 A191 A201 1
3
A11 42 A32 A42 7882 A61 A74 2 A93 A103 … A122 45 A143 A153 1 A173 2 A191 A201 1
4
A11 24 A33 A40 4870 A61 A73 3 A93 A101 … A124 53 A143 A153 2 A173 2 A191 A201 2
5 rows × 21 columns
data.Balance.unique()11
array([‘A11’, ‘A12’, ‘A14’, ‘A13’], dtype=object)data.count()11
Balance 1000 Duration 1000 History 1000 Purpose 1000 Credit amount 1000 Savings 1000 Employment 1000 instPercent 1000 sexMarried 1000 Guarantors 1000 Residence duration 1000 Assets 1000 Age 1000 concCredit 1000 Apartment 1000 Credits 1000 Occupation 1000 Dependents 1000 hasPhone 1000 Foreign 1000 lable 1000 dtype: int64#部分变量描述性统计分析
data.describe()1212
Duration
Credit amount
instPercent
Residence duration
Age
Credits
Dependents
lable
count
1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean
20.903000 3271.258000 2.973000 2.845000 35.546000 1.407000 1.155000 1.300000
std
12.058814 2822.736876 1.118715 1.103718 11.375469 0.577654 0.362086 0.458487
min
4.000000 250.000000 1.000000 1.000000 19.000000 1.000000 1.000000 1.000000
25%
12.000000 1365.500000 2.000000 2.000000 27.000000 1.000000 1.000000 1.000000
50%
18.000000 2319.500000 3.000000 3.000000 33.000000 1.000000 1.000000 1.000000
75%
24.000000 3972.250000 4.000000 4.000000 42.000000 2.000000 1.000000 2.000000
max
72.000000 18424.000000 4.000000 4.000000 75.000000 4.000000 2.000000 2.000000
data.Duration.unique()11
array([ 6, 48, 12, 42, 24, 36, 30, 15, 9, 10, 7, 60, 18, 45, 11, 27, 8, 54, 20, 14, 33, 21, 16, 4, 47, 13, 22, 39, 28, 5, 26, 72, 40], dtype=int64)data.History.unique()11
array([‘A34’, ‘A32’, ‘A33’, ‘A30’, ‘A31’], dtype=object)data.groupby('Balance').size().order(ascending=False)11
c:\python27\lib\site-packages\ipykernel\__main__.py:1: FutureWarning: order is deprecated, use sort_values(…) if __name__ == ‘__main__’: Balance A14 394 A11 274 A12 269 A13 63 dtype: int64data.groupby('Purpose').size().order(ascending=False)11
c:\python27\lib\site-packages\ipykernel\__main__.py:1: FutureWarning: order is deprecated, use sort_values(…) if __name__ == ‘__main__’: Purpose A43 280 A40 234 A42 181 A41 103 A49 97 A46 50 A45 22 A44 12 A410 12 A48 9 dtype: int64data.groupby('Apartment').size().order(ascending=False)11
c:\python27\lib\site-packages\ipykernel\__main__.py:1: FutureWarning: order is deprecated, use sort_values(…) if __name__ == ‘__main__’: Apartment A152 713 A151 179 A153 108 dtype: int64import matplotlib.pyplot as plt
%matplotlib inline
data.plot(x='lable', y='Age', kind='scatter',
alpha=0.02, s=50);12341234
![png](output_13_0.png)data.hist('Age', bins=15);11
![png](output_14_0.png)target=data.lable11
features_data=data.drop('lable',axis=1)11
numeric_features = [c for c in features_data if features_data[c].dtype.kind in ('i', 'f')] # 提取数值类型为整数或浮点数的变量11
numeric_features11
[‘Duration’, ‘Credit amount’, ‘instPercent’, ‘Residence duration’, ‘Age’, ‘Credits’, ‘Dependents’]numeric_data = features_data[numeric_features]11
numeric_data.head()11
Duration
Credit amount
instPercent
Residence duration
Age
Credits
Dependents
0
6 1169 4 4 67 2 1
1
48 5951 2 2 22 1 1
2
12 2096 2 3 49 1 2
3
42 7882 2 4 45 1 2
4
24 4870 3 4 53 2 2
categorical_data = features_data.drop(numeric_features, axis=1)11
categorical_data.head()11
Balance
History
Purpose
Savings
Employment
sexMarried
Guarantors
Assets
concCredit
Apartment
Occupation
hasPhone
Foreign
0
A11 A34 A43 A65 A75 A93 A101 A121 A143 A152 A173 A192 A201
1
A12 A32 A43 A61 A73 A92 A101 A121 A143 A152 A173 A191 A201
2
A14 A34 A46 A61 A74 A93 A101 A121 A143 A152 A172 A191 A201
3
A11 A32 A42 A61 A74 A93 A103 A122 A143 A153 A173 A191 A201
4
A11 A33 A40 A61 A73 A93 A101 A124 A143 A153 A173 A191 A201
categorical_data_encoded = categorical_data.apply(lambda x: pd.factorize(x)[0]) # pd.factorize即可将分类变量转换为数值表示
# apply运算将转换函数应用到每一个变量维度
categorical_data_encoded.head(5)123123
Balance
History
Purpose
Savings
Employment
sexMarried
Guarantors
Assets
concCredit
Apartment
Occupation
hasPhone
Foreign
0
0 0 0 0 0 0 0 0 0 0 0 0 0
1
1 1 0 1 1 1 0 0 0 0 0 1 0
2
2 0 1 1 2 0 0 0 0 0 1 1 0
3
0 1 2 1 2 0 1 1 0 1 0 1 0
4
0 2 3 1 1 0 0 2 0 1 0 1 0
features = pd.concat([numeric_data, categorical_data_encoded], axis=1)#进行数据的合并
features.head()
# 此处也可以选用one-hot编码来表示分类变量,相应的程序如下:
# features = pd.get_dummies(features_data)
# features.head()1234512345
Duration
Credit amount
instPercent
Residence duration
Age
Credits
Dependents
Balance
History
Purpose
Savings
Employment
sexMarried
Guarantors
Assets
concCredit
Apartment
Occupation
hasPhone
Foreign
0
6 1169 4 4 67 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0
1
48 5951 2 2 22 1 1 1 1 0 1 1 1 0 0 0 0 0 1 0
2
12 2096 2 3 49 1 2 2 0 1 1 2 0 0 0 0 0 1 1 0
3
42 7882 2 4 45 1 2 0 1 2 1 2 0 1 1 0 1 0 1 0
4
24 4870 3 4 53 2 2 0 2 3 1 1 0 0 2 0 1 0 1 0
X = features.values.astype(np.float32) # 转换数据类型
y = (target.values == 1).astype(np.int32) # 1:good,2:bad1212
from sklearn.cross_validation import train_test_split # sklearn库中train_test_split函数可实现该划分
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=0) # 参数test_size设置训练集占比
1234512345
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score
clf = DecisionTreeClassifier(max_depth=8) # 参数max_depth设置树最大深度
# 交叉验证,评价分类器性能,此处选择的评分标准是ROC曲线下的AUC值,对应AUC更大的分类器效果更好
scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
print("ROC AUC Decision Tree: {:.4f} +/-{:.4f}".format(
np.mean(scores), np.std(scores)))123456789123456789
ROC AUC Decision Tree: 0.6866 +/-0.0105
#利用learning curve,以样本数为横坐标,训练和交叉验证集上的评分为纵坐标,对不同深度的决策树进行对比(判断是否存在过拟合或欠拟合)
from sklearn.learning_curve import learning_curve
def plot_learning_curve(estimator, X, y, ylim=(0, 1.1), cv=3,
n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5),
scoring=None):
plt.title("Learning curves for %s" % type(estimator).__name__)
plt.ylim(*ylim); plt.grid()
plt.xlabel("Training examples")
plt.ylabel("Score")
train_sizes, train_scores, validation_scores = learning_curve(
estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
scoring=scoring)
train_scores_mean = np.mean(train_scores, axis=1)
validation_scores_mean = np.mean(validation_scores, axis=1)
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
label="Training score")
plt.plot(train_sizes, validation_scores_mean, 'o-', color="g",
label="Cross-validation score")
plt.legend(loc="best")
print("Best validation score: {:.4f}".format(validation_scores_mean[-1]))12345678910111213141516171819202122231234567891011121314151617181920212223
clf = DecisionTreeClassifier(max_depth=None)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')
# 可以注意到训练数据和交叉验证数据的得分有很大的差距,意味着可能过度拟合训练数据了123123
Best validation score: 0.6310
clf = DecisionTreeClassifier(max_depth=10)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')1212
Best validation score: 0.6565
clf = DecisionTreeClassifier(max_depth=8)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')1212
Best validation score: 0.6762
clf = DecisionTreeClassifier(max_depth=5)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')1212
Best validation score: 0.7219
clf = DecisionTreeClassifier(max_depth=4)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')1212
Best validation score: 0.7226
import pandas as pd11
names=("Balance,Duration,History,Purpose,Credit amount,Savings,Employment,instPercent,sexMarried,Guarantors,Residence duration,Assets,Age,concCredit,Apartment,Credits,Occupation,Dependents,hasPhone,Foreign,lable").split(',')11
data=pd.read_csv("Desktop/sunshengyun/data/german/german.data",sep='\s+',names=names)11
data.head()11
Balance
Duration
History
Purpose
Credit amount
Savings
Employment
instPercent
sexMarried
Guarantors
…
Assets
Age
concCredit
Apartment
Credits
Occupation
Dependents
hasPhone
Foreign
lable
0
A11 6 A34 A43 1169 A65 A75 4 A93 A101 … A121 67 A143 A152 2 A173 1 A192 A201 1
1
A12 48 A32 A43 5951 A61 A73 2 A92 A101 … A121 22 A143 A152 1 A173 1 A191 A201 2
2
A14 12 A34 A46 2096 A61 A74 2 A93 A101 … A121 49 A143 A152 1 A172 2 A191 A201 1
3
A11 42 A32 A42 7882 A61 A74 2 A93 A103 … A122 45 A143 A153 1 A173 2 A191 A201 1
4
A11 24 A33 A40 4870 A61 A73 3 A93 A101 … A124 53 A143 A153 2 A173 2 A191 A201 2
5 rows × 21 columns
data.Balance.unique()11
array([‘A11’, ‘A12’, ‘A14’, ‘A13’], dtype=object)data.count()11
Balance 1000 Duration 1000 History 1000 Purpose 1000 Credit amount 1000 Savings 1000 Employment 1000 instPercent 1000 sexMarried 1000 Guarantors 1000 Residence duration 1000 Assets 1000 Age 1000 concCredit 1000 Apartment 1000 Credits 1000 Occupation 1000 Dependents 1000 hasPhone 1000 Foreign 1000 lable 1000 dtype: int64#部分变量描述性统计分析
data.describe()1212
Duration
Credit amount
instPercent
Residence duration
Age
Credits
Dependents
lable
count
1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000
mean
20.903000 3271.258000 2.973000 2.845000 35.546000 1.407000 1.155000 1.300000
std
12.058814 2822.736876 1.118715 1.103718 11.375469 0.577654 0.362086 0.458487
min
4.000000 250.000000 1.000000 1.000000 19.000000 1.000000 1.000000 1.000000
25%
12.000000 1365.500000 2.000000 2.000000 27.000000 1.000000 1.000000 1.000000
50%
18.000000 2319.500000 3.000000 3.000000 33.000000 1.000000 1.000000 1.000000
75%
24.000000 3972.250000 4.000000 4.000000 42.000000 2.000000 1.000000 2.000000
max
72.000000 18424.000000 4.000000 4.000000 75.000000 4.000000 2.000000 2.000000
data.Duration.unique()11
array([ 6, 48, 12, 42, 24, 36, 30, 15, 9, 10, 7, 60, 18, 45, 11, 27, 8, 54, 20, 14, 33, 21, 16, 4, 47, 13, 22, 39, 28, 5, 26, 72, 40], dtype=int64)data.History.unique()11
array([‘A34’, ‘A32’, ‘A33’, ‘A30’, ‘A31’], dtype=object)data.groupby('Balance').size().order(ascending=False)11
c:\python27\lib\site-packages\ipykernel\__main__.py:1: FutureWarning: order is deprecated, use sort_values(…) if __name__ == ‘__main__’: Balance A14 394 A11 274 A12 269 A13 63 dtype: int64data.groupby('Purpose').size().order(ascending=False)11
c:\python27\lib\site-packages\ipykernel\__main__.py:1: FutureWarning: order is deprecated, use sort_values(…) if __name__ == ‘__main__’: Purpose A43 280 A40 234 A42 181 A41 103 A49 97 A46 50 A45 22 A44 12 A410 12 A48 9 dtype: int64data.groupby('Apartment').size().order(ascending=False)11
c:\python27\lib\site-packages\ipykernel\__main__.py:1: FutureWarning: order is deprecated, use sort_values(…) if __name__ == ‘__main__’: Apartment A152 713 A151 179 A153 108 dtype: int64import matplotlib.pyplot as plt
%matplotlib inline
data.plot(x='lable', y='Age', kind='scatter',
alpha=0.02, s=50);12341234
![png](output_13_0.png)data.hist('Age', bins=15);11
![png](output_14_0.png)target=data.lable11
features_data=data.drop('lable',axis=1)11
numeric_features = [c for c in features_data if features_data[c].dtype.kind in ('i', 'f')] # 提取数值类型为整数或浮点数的变量11
numeric_features11
[‘Duration’, ‘Credit amount’, ‘instPercent’, ‘Residence duration’, ‘Age’, ‘Credits’, ‘Dependents’]numeric_data = features_data[numeric_features]11
numeric_data.head()11
Duration
Credit amount
instPercent
Residence duration
Age
Credits
Dependents
0
6 1169 4 4 67 2 1
1
48 5951 2 2 22 1 1
2
12 2096 2 3 49 1 2
3
42 7882 2 4 45 1 2
4
24 4870 3 4 53 2 2
categorical_data = features_data.drop(numeric_features, axis=1)11
categorical_data.head()11
Balance
History
Purpose
Savings
Employment
sexMarried
Guarantors
Assets
concCredit
Apartment
Occupation
hasPhone
Foreign
0
A11 A34 A43 A65 A75 A93 A101 A121 A143 A152 A173 A192 A201
1
A12 A32 A43 A61 A73 A92 A101 A121 A143 A152 A173 A191 A201
2
A14 A34 A46 A61 A74 A93 A101 A121 A143 A152 A172 A191 A201
3
A11 A32 A42 A61 A74 A93 A103 A122 A143 A153 A173 A191 A201
4
A11 A33 A40 A61 A73 A93 A101 A124 A143 A153 A173 A191 A201
categorical_data_encoded = categorical_data.apply(lambda x: pd.factorize(x)[0]) # pd.factorize即可将分类变量转换为数值表示
# apply运算将转换函数应用到每一个变量维度
categorical_data_encoded.head(5)123123
Balance
History
Purpose
Savings
Employment
sexMarried
Guarantors
Assets
concCredit
Apartment
Occupation
hasPhone
Foreign
0
0 0 0 0 0 0 0 0 0 0 0 0 0
1
1 1 0 1 1 1 0 0 0 0 0 1 0
2
2 0 1 1 2 0 0 0 0 0 1 1 0
3
0 1 2 1 2 0 1 1 0 1 0 1 0
4
0 2 3 1 1 0 0 2 0 1 0 1 0
features = pd.concat([numeric_data, categorical_data_encoded], axis=1)#进行数据的合并
features.head()
# 此处也可以选用one-hot编码来表示分类变量,相应的程序如下:
# features = pd.get_dummies(features_data)
# features.head()1234512345
Duration
Credit amount
instPercent
Residence duration
Age
Credits
Dependents
Balance
History
Purpose
Savings
Employment
sexMarried
Guarantors
Assets
concCredit
Apartment
Occupation
hasPhone
Foreign
0
6 1169 4 4 67 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0
1
48 5951 2 2 22 1 1 1 1 0 1 1 1 0 0 0 0 0 1 0
2
12 2096 2 3 49 1 2 2 0 1 1 2 0 0 0 0 0 1 1 0
3
42 7882 2 4 45 1 2 0 1 2 1 2 0 1 1 0 1 0 1 0
4
24 4870 3 4 53 2 2 0 2 3 1 1 0 0 2 0 1 0 1 0
X = features.values.astype(np.float32) # 转换数据类型
y = (target.values == 1).astype(np.int32) # 1:good,2:bad1212
from sklearn.cross_validation import train_test_split # sklearn库中train_test_split函数可实现该划分
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=0) # 参数test_size设置训练集占比
1234512345
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score
clf = DecisionTreeClassifier(max_depth=8) # 参数max_depth设置树最大深度
# 交叉验证,评价分类器性能,此处选择的评分标准是ROC曲线下的AUC值,对应AUC更大的分类器效果更好
scores = cross_val_score(clf, X_train, y_train, cv=3, scoring='roc_auc')
print("ROC AUC Decision Tree: {:.4f} +/-{:.4f}".format(
np.mean(scores), np.std(scores)))123456789123456789
ROC AUC Decision Tree: 0.6866 +/-0.0105
#利用learning curve,以样本数为横坐标,训练和交叉验证集上的评分为纵坐标,对不同深度的决策树进行对比(判断是否存在过拟合或欠拟合)
from sklearn.learning_curve import learning_curve
def plot_learning_curve(estimator, X, y, ylim=(0, 1.1), cv=3,
n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5),
scoring=None):
plt.title("Learning curves for %s" % type(estimator).__name__)
plt.ylim(*ylim); plt.grid()
plt.xlabel("Training examples")
plt.ylabel("Score")
train_sizes, train_scores, validation_scores = learning_curve(
estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
scoring=scoring)
train_scores_mean = np.mean(train_scores, axis=1)
validation_scores_mean = np.mean(validation_scores, axis=1)
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
label="Training score")
plt.plot(train_sizes, validation_scores_mean, 'o-', color="g",
label="Cross-validation score")
plt.legend(loc="best")
print("Best validation score: {:.4f}".format(validation_scores_mean[-1]))12345678910111213141516171819202122231234567891011121314151617181920212223
clf = DecisionTreeClassifier(max_depth=None)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')
# 可以注意到训练数据和交叉验证数据的得分有很大的差距,意味着可能过度拟合训练数据了123123
Best validation score: 0.6310
clf = DecisionTreeClassifier(max_depth=10)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')1212
Best validation score: 0.6565
clf = DecisionTreeClassifier(max_depth=8)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')1212
Best validation score: 0.6762
clf = DecisionTreeClassifier(max_depth=5)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')1212
Best validation score: 0.7219
clf = DecisionTreeClassifier(max_depth=4)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')1212
Best validation score: 0.7226
光点科技
2023-08-15 广告
2023-08-15 广告
通常情况下,我们会按照结构模型把系统产生的数据分为三种类型:结构化数据、半结构化数据和非结构化数据。结构化数据,即行数据,是存储在数据库里,可以用二维表结构来逻辑表达实现的数据。最常见的就是数字数据和文本数据,它们可以某种标准格式存在于文件...
点击进入详情页
本回答由光点科技提供
推荐律师服务:
若未解决您的问题,请您详细描述您的问题,通过百度律临进行免费专业咨询