DAY 15

发布时间:2026/7/6 1:22:30
"""DAY 15 (@浙大疏锦行) - handling an imbalanced credit-default dataset.

Topics: credit dataset, missing-value imputation, imbalance handling,
cross-validation, hyper-parameter tuning.

Minimal outline of the script:

1. Preprocessing: read the credit CSV, separate the features from the
   default label, encode text features, and check whether the classes are
   imbalanced.
2. Split: stratified train/test split so both parts keep the same share of
   default samples.
3. Build four comparison pipelines (resampling lives inside the pipeline so
   that it is only applied while fitting):
   - Baseline: standardisation + random forest (control group).
   - SMOTE: over-sampling that synthesises minority (default) samples.
   - SMOTEENN: combined sampling, over-sampling plus noise removal.
   - Weighted: data left untouched, minority class weighted more heavily
     during training.
4. Grid search: stratified 5-fold cross-validation over all four pipelines,
   optimising F1 and reporting recall, precision and AUC.
5. Comparison: collect the metrics of every strategy and pick the one with
   the highest F1.
6. Threshold tuning: instead of the default 0.5 threshold, use the PR curve
   to find the F1-optimal cut-off, then plot the result.
7. Optional (not implemented in this script): persist the best model for
   later predictions.
"""

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler

warnings.filterwarnings("ignore")

# Matplotlib font setup so Chinese titles/labels render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

# ---------------------------------------------------------------------------
# 1. Load the data
# ---------------------------------------------------------------------------
file_path = r"C:\Python Study\Python60DaysChallenge-main\data.csv"
data = pd.read_csv(file_path)
print("数据集形状:", data.shape)
print("\n数据集全部列名")
print(data.columns.tolist())

TARGET_COL = 'Credit Default'

# Separate features and label.
X = data.drop(columns=[TARGET_COL])
y = data[TARGET_COL]

# Distinguish numeric columns from text (categorical) columns.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# Report the number of missing values per column.
print("\n各列缺失值统计")
print(X.isnull().sum())

# Encode text features. The fitted encoders are kept in `le_dict`, keyed by
# column name.
le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    # Fill BEFORE casting to str: `astype(str)` turns NaN into the literal
    # string 'nan', after which `fillna` would have nothing left to fill.
    X[col] = le.fit_transform(X[col].fillna("Missing").astype(str))
    le_dict[col] = le

# Simple median fill for numeric gaps (temporary preprocessing-stage handling
# so that SMOTE, which cannot handle NaN, can run later).
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split below; moving the imputer into the pipelines would avoid
# that leakage - confirm whether that matters for this exercise.
if num_cols:
    X[num_cols] = SimpleImputer(strategy="median").fit_transform(X[num_cols])

# Visualise the class distribution.
print("\n原始数据集目标变量分布:")
print(y.value_counts())
plt.figure(figsize=(6, 4))
sns.countplot(x=y)
plt.title('信贷违约类别分布')
plt.show()

# ---------------------------------------------------------------------------
# 2. Stratified train/test split
# ---------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"\n训练集分布: {pd.Series(y_train).value_counts().to_dict()}")
print(f"测试集分布: {pd.Series(y_test).value_counts().to_dict()}")

# ---------------------------------------------------------------------------
# 3. Shared configuration
# ---------------------------------------------------------------------------
base_clf = RandomForestClassifier(random_state=42)
param_grid_common = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [5, 10],
    'classifier__min_samples_split': [2, 5],
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ---------------------------------------------------------------------------
# 4. Four pipelines (data was imputed above, so there are no NaNs and SMOTE
#    can run)
# ---------------------------------------------------------------------------
# Baseline
pipeline_baseline = ImbPipeline([
    ('scaler', StandardScaler()),
    ('classifier', base_clf),
])
param_baseline = param_grid_common.copy()

# SMOTE
pipeline_smote = ImbPipeline([
    ('scaler', StandardScaler()),
    ('sampler', SMOTE(random_state=42)),
    ('classifier', base_clf),
])
param_smote = {**param_grid_common, 'sampler__k_neighbors': [3, 5]}

# SMOTEENN
pipeline_smotenn = ImbPipeline([
    ('scaler', StandardScaler()),
    ('sampler', SMOTEENN(random_state=42)),
    ('classifier', base_clf),
])
param_smotenn = param_grid_common.copy()

# Class-weight balancing
pipeline_weighted = ImbPipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced')),
])
param_weighted = param_grid_common.copy()


# ---------------------------------------------------------------------------
# 5. Grid-search helper
# ---------------------------------------------------------------------------
def run_gridsearch(pipeline, param_grid, name):
    """Grid-search one pipeline and report its test-set metrics.

    Fits a ``GridSearchCV`` (scoring ``'f1'``) on the module-level
    ``X_train``/``y_train`` using the module-level ``cv`` splitter, then
    evaluates the refitted best estimator on the module-level
    ``X_test``/``y_test`` and prints the metrics, the classification report
    and the confusion matrix.

    Args:
        pipeline: Estimator/pipeline to tune; its final step must provide
            ``predict`` and ``predict_proba``.
        param_grid: Parameter grid passed to ``GridSearchCV``.
        name: Strategy name, used only in the printed header.

    Returns:
        Tuple ``(best_estimator, test_f1, test_recall, test_precision,
        test_auc)``.
    """
    print(f"\n{'=' * 60}")
    print(f"正在运行策略: {name}")
    print(f"{'=' * 60}")

    gs = GridSearchCV(
        pipeline, param_grid, cv=cv, scoring='f1', n_jobs=-1, verbose=1
    )
    gs.fit(X_train, y_train)

    print(f"最佳参数组合: {gs.best_params_}")
    print(f"交叉验证最佳 F1 (平均): {gs.best_score_:.4f}")

    best_estimator = gs.best_estimator_
    y_pred = best_estimator.predict(X_test)
    # Column 1 holds the probability of the second class in `classes_`.
    y_score = best_estimator.predict_proba(X_test)[:, 1]

    test_f1 = f1_score(y_test, y_pred)
    test_recall = recall_score(y_test, y_pred)
    test_precision = precision_score(y_test, y_pred)
    test_auc = roc_auc_score(y_test, y_score)

    print(f"测试集 F1: {test_f1:.4f}")
    print(f"测试集 召回率(Recall): {test_recall:.4f}")
    print(f"测试集 精确率(Precision): {test_precision:.4f}")
    print(f"测试集 AUC: {test_auc:.4f}")
    print("\n分类报告:")
    print(classification_report(y_test, y_pred))
    print("混淆矩阵:")
    print(confusion_matrix(y_test, y_pred))

    return best_estimator, test_f1, test_recall, test_precision, test_auc


# ---------------------------------------------------------------------------
# 6. Train and compare all strategies
# ---------------------------------------------------------------------------
results = {}
best_models = {}
strategies = [
    ('Baseline', pipeline_baseline, param_baseline),
    ('SMOTE', pipeline_smote, param_smote),
    ('SMOTEENN', pipeline_smotenn, param_smotenn),
    ('Weighted', pipeline_weighted, param_weighted),
]

for name, pipe, params in strategies:
    model, f1, rec, prec, auc = run_gridsearch(pipe, params, name)
    results[name] = {'F1': f1, 'Recall': rec, 'Precision': prec, 'AUC': auc}
    best_models[name] = model

# ---------------------------------------------------------------------------
# 7. Summary of results
# ---------------------------------------------------------------------------
print("\n\n" + "=" * 60)
print("各策略性能对比")
print("=" * 60)
df_results = pd.DataFrame(results).T
print(df_results.round(4))

best_strategy = df_results['F1'].idxmax()
best_model = best_models[best_strategy]
print(f"\n最优策略: {best_strategy}, F1 = {df_results.loc[best_strategy, 'F1']:.4f}")

# ---------------------------------------------------------------------------
# 8. Threshold tuning and plots
# ---------------------------------------------------------------------------
# NOTE(review): the threshold is selected on the test set, so the F1 printed
# below is an optimistic estimate; a separate validation split would give an
# unbiased figure - confirm whether that is needed here.
print("\n阈值微调")
y_proba = best_model.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)

# The small epsilon avoids 0/0 where precision and recall are both zero.
fscores = 2 * (precisions * recalls) / (precisions + recalls + 1e-9)

# `precisions`/`recalls` have one more entry than `thresholds`; the final
# point has no threshold, so it is excluded from the search.
ix = np.argmax(fscores[:-1])
best_threshold = thresholds[ix]
print(f"最优阈值: {best_threshold:.4f}")
print(f"对应F1:{fscores[ix]:.4f} 召回:{recalls[ix]:.4f} 精确:{precisions[ix]:.4f}")

# assumes the label is encoded as 0/1 with 1 = default - TODO confirm
y_pred_new = (y_proba >= best_threshold).astype(int)
print("\n调整阈值后分类报告")
print(classification_report(y_test, y_pred_new))

# Plots: metrics vs. threshold (left) and the PR curve (right); the red dot
# marks the selected threshold in both.
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(thresholds, precisions[:-1], '--', label='Precision')
plt.plot(thresholds, recalls[:-1], ':', label='Recall')
plt.plot(thresholds, fscores[:-1], linewidth=2, label='F1')
plt.scatter(best_threshold, fscores[ix], c='red', s=100)
plt.xlabel("Threshold")
plt.grid(True)
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(recalls, precisions)
plt.scatter(recalls[ix], precisions[ix], c='red', s=100)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.grid(True)

plt.tight_layout()
plt.show()
print("全部执行完毕")