DAY 15

发布时间：2026/7/6 1:22:30

"""DAY 15 - Handling an imbalanced dataset (credit default data).

Outline of the script (after 浙大疏锦行):

1. Preprocessing: read the credit CSV, separate the features from the
   default label, encode the text features and check whether the classes
   are imbalanced.
2. Dataset split: stratified train/test split so that both sets keep the
   same proportion of default samples.
3. Four comparison pipelines (built as pipelines to avoid data leakage):
   - Baseline: standardization + random forest (control group).
   - SMOTE: oversampling, synthesizes minority (default) samples.
   - SMOTEENN: hybrid sampling, oversampling plus noise removal.
   - Class weighting: data untouched, minority-class errors are weighted
     more heavily during training.
4. Grid search: stratified 5-fold cross-validation over all four
   pipelines, optimizing F1 and reporting recall, precision and AUC.
5. Side-by-side comparison: collect the metrics of every strategy and pick
   the one with the highest F1.
6. Threshold optimization: instead of the default 0.5 threshold, use the
   PR curve to find the F1-optimal cut point, then plot the result.
7. Optional: save the best model for later predictions (not implemented
   in this script).
"""
# DAY 15: imbalanced data handling -- credit dataset, missing-value
# imputation, imbalance handling, cross-validation, hyperparameter tuning.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    recall_score,
    precision_score,
    roc_auc_score,
    precision_recall_curve,
)
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
import warnings

warnings.filterwarnings("ignore")

# Matplotlib font setup so that Chinese labels and minus signs render.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False

# 1. Load the data
file_path = r"C:\Python Study\Python60DaysChallenge-main\data.csv"
data = pd.read_csv(file_path)
print("数据集形状:", data.shape)
print("\n数据集全部列名")
print(data.columns.tolist())

TARGET_COL = 'Credit Default'

# Separate features and label.
X = data.drop(columns=[TARGET_COL])
y = data[TARGET_COL]

# Distinguish numeric columns from text/categorical columns.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# Report the number of missing values per column.
print("\n各列缺失值统计")
print(X.isnull().sum())

# Encode the text features.
le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    # Fill BEFORE casting: astype(str) turns NaN into the string "nan",
    # after which fillna() has nothing left to fill.
    X[col] = le.fit_transform(X[col].fillna("Missing").astype(str))
    le_dict[col] = le
# NOTE: the encoders are still fitted on the full dataset. They only see
# category names (no target, no numeric statistics); fit them on the
# training split instead if unseen test categories must be simulated.

# Numeric missing values are NOT imputed here: the median imputer is the
# first step of every pipeline below, so it is fitted on training folds only
# and no test-set statistics leak into training.

# Visualize the class distribution.
print("\n原始数据集目标变量分布:")
print(y.value_counts())
plt.figure(figsize=(6, 4))
sns.countplot(x=y)
plt.title('信贷违约类别分布')
plt.show()

# 2. Stratified train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"\n训练集分布: {pd.Series(y_train).value_counts().to_dict()}")
print(f"测试集分布: {pd.Series(y_test).value_counts().to_dict()}")

# 3. Shared configuration
base_clf = RandomForestClassifier(random_state=42)
param_grid_common = {
    'classifier__n_estimators': [50, 100],
    'classifier__max_depth': [5, 10],
    'classifier__min_samples_split': [2, 5],
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 4. Four pipelines. The imputer runs first, so the samplers (SMOTE cannot
# handle NaN) always receive complete data.
# Baseline
pipeline_baseline = ImbPipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
    ('classifier', base_clf),
])
param_baseline = param_grid_common.copy()

# SMOTE
pipeline_smote = ImbPipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
    ('sampler', SMOTE(random_state=42)),
    ('classifier', base_clf),
])
param_smote = {**param_grid_common, 'sampler__k_neighbors': [3, 5]}

# SMOTEENN
pipeline_smotenn = ImbPipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
    ('sampler', SMOTEENN(random_state=42)),
    ('classifier', base_clf),
])
param_smotenn = param_grid_common.copy()

# Class weighting
pipeline_weighted = ImbPipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced')),
])
param_weighted = param_grid_common.copy()


# 5. Grid-search helper
def run_gridsearch(pipeline, param_grid, name):
    """Grid-search one strategy and evaluate its best estimator.

    Runs ``GridSearchCV`` (F1 scoring, module-level ``cv`` splitter) on the
    module-level ``X_train``/``y_train``, then scores the refitted best
    estimator on ``X_test``/``y_test`` and prints a report.

    Args:
        pipeline: Estimator/pipeline to tune.
        param_grid: Parameter grid passed to ``GridSearchCV``.
        name: Display name of the strategy, used in the printed header.

    Returns:
        Tuple ``(best_estimator, test_f1, test_recall, test_precision,
        test_auc)`` measured on the held-out test set.
    """
    print(f"\n{'=' * 60}")
    print(f"正在运行策略: {name}")
    print(f"{'=' * 60}")

    gs = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1', n_jobs=-1, verbose=1)
    gs.fit(X_train, y_train)
    print(f"最佳参数组合: {gs.best_params_}")
    print(f"交叉验证最佳 F1 (平均): {gs.best_score_:.4f}")

    best_estimator = gs.best_estimator_
    y_pred = best_estimator.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)
    test_recall = recall_score(y_test, y_pred)
    test_precision = precision_score(y_test, y_pred)
    # AUC needs the positive-class probability, not the hard prediction.
    test_auc = roc_auc_score(y_test, best_estimator.predict_proba(X_test)[:, 1])

    print(f"测试集 F1: {test_f1:.4f}")
    print(f"测试集召回率(Recall): {test_recall:.4f}")
    print(f"测试集精确率(Precision): {test_precision:.4f}")
    print(f"测试集 AUC: {test_auc:.4f}")
    print("\n分类报告:")
    print(classification_report(y_test, y_pred))
    print("混淆矩阵:")
    print(confusion_matrix(y_test, y_pred))

    return best_estimator, test_f1, test_recall, test_precision, test_auc


# 6. Train and compare all strategies
results = {}
best_models = {}
strategies = [
    ('Baseline', pipeline_baseline, param_baseline),
    ('SMOTE', pipeline_smote, param_smote),
    ('SMOTEENN', pipeline_smotenn, param_smotenn),
    ('Weighted', pipeline_weighted, param_weighted),
]
for name, pipe, params in strategies:
    model, f1, rec, prec, auc = run_gridsearch(pipe, params, name)
    results[name] = {'F1': f1, 'Recall': rec, 'Precision': prec, 'AUC': auc}
    best_models[name] = model

# 7. Summary of results
print("\n\n" + "=" * 60)
print("各策略性能对比")
print("=" * 60)
df_results = pd.DataFrame(results).T
print(df_results.round(4))

best_strategy = df_results['F1'].idxmax()
best_model = best_models[best_strategy]
print(f"\n最优策略: {best_strategy}, F1 = {df_results.loc[best_strategy, 'F1']:.4f}")

# 8. Threshold tuning and plots
# NOTE: the threshold is selected on the test set, so the F1 reported for it
# is optimistic; choose it on a separate validation split for an unbiased
# estimate.
print("\n阈值微调")
y_proba = best_model.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)
# The small epsilon avoids 0/0 where precision and recall are both zero.
fscores = 2 * (precisions * recalls) / (precisions + recalls + 1e-9)
# precisions/recalls have one more entry than thresholds; drop the last one.
ix = np.argmax(fscores[:-1])
best_threshold = thresholds[ix]
print(f"最优阈值: {best_threshold:.4f}")
print(f"对应F1:{fscores[ix]:.4f} 召回:{recalls[ix]:.4f} 精确:{precisions[ix]:.4f}")

y_pred_new = (y_proba >= best_threshold).astype(int)
print("\n调整阈值后分类报告")
print(classification_report(y_test, y_pred_new))

# Plots: metrics vs. threshold (left) and the PR curve (right).
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(thresholds, precisions[:-1], '--', label='Precision')
plt.plot(thresholds, recalls[:-1], ':', label='Recall')
plt.plot(thresholds, fscores[:-1], linewidth=2, label='F1')
plt.scatter(best_threshold, fscores[ix], c='red', s=100)
plt.xlabel("Threshold")
plt.grid(True)
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(recalls, precisions)
plt.scatter(recalls[ix], precisions[ix], c='red', s=100)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.grid(True)
plt.tight_layout()
plt.show()
print("全部执行完毕")

相关新闻

计算机网络（2）

【独家首发】基于遗传算法GA-GMDH的风电数据回归预测研究Matlab实现

java: Singleton Pattern

从 KV Cache 到分布式状态机设计，一文讲透 AI Agent 的底层运行机制

大模型企业级智能体产品对比：百度、阿里、腾讯、华为、字节、实在智能哪家强？

前端转行Agent开发，我写了一个企业级开源项目，附教程

镇江高口碑黄金回收白银回收

郑州高口碑黄金回收白银回收

2026年选圣熙民宿托管，专业团队让你当甩手掌柜

中文大模型选型不是比参数，而是做工程化决策

STM32与LENA-R8构建全球定位与通信嵌入式系统

含金量高的EMBA｜2026国内及境外中英双语EMBA综合实力TOP5榜单

工业4-20mA电流环信号传输与XTR116应用设计

YOLO目标检测实战：从环境搭建到模型部署的保姆级教程

从论文到实践：一维卷积神经网络在RUL预测中的复现与调优

别再死记硬背了！用‘分界线’思维彻底搞懂C++ set的lower_bound和upper_bound

TwitchDropsMiner：无需观看直播，自动化获取Twitch掉落奖励的终极指南

从提示工程到上下文工程：2026年AI开发者的核心技能转换