使用 WSL 进行 +分类+特征重要性 简单实践
银行需要应对数量不断上升的欺诈案件。随着新技术的出现,欺诈事件的数量将成倍增加,银行很难逐笔检查交易并手动识别欺诈模式。RPA(机器人流程自动化)使用“if-then”规则识别潜在的欺诈行为,并将其标记给相关部门。例如,如果某个账户在短时间内进行了多次交易,RPA 会识别该账户并将其标记为潜在威胁。这有助于银行仔细审查账户并调查欺诈行为。
如上所示,我们的输入是金融数据中的个人保险索赔记录(其中包含索赔特征、客户特征和保险特征)。
经过一些预处理并添加新的特征之后,我们使用这些数据来训练分类器。
分类器训练完成之后,就可以用它来判定一条新记录应被接受(非欺诈)还是被拒绝(欺诈)。
下面将更详细地描述该过程的流程 。
我们首先做一些初始的预处理,将数据字段转换成合适的格式。然后,基于输入,我们生成一组描述客户的特征,这些特征基于以前的索赔次数、以前的欺诈发生次数、索赔总额等因素。这些客户细分特征,连同表示警告代码、诊断代码等是否存在的特征,一并添加到现有数据集中。
XGBoost 是一个梯度提升决策树的实现,旨在提高速度和性能。该算法的实现着重于提高计算时间和内存资源的利用效率,设计目标是充分利用现有资源来训练模型。我们使用 XGBoost 分类器(XGBClassifier)来确定索赔是否具有欺诈性。
# Imports and Initializationfrom xgboost import XGBClassifier, plot_importancefrom sklearn.model_selection import RandomizedSearchCVfrom pyspark.sql import functions as Ffrom pyspark.sql.types import *from sklearn import preprocessingimport sysimport osimport numpy as npimport pandas as pdfrom sklearn.metrics import precision_recall_curvefrom sklearn.model_selection import train_test_splitfrom sklearn.metrics import average_precision_score, confusion_matrix, accuracy_scorefrom pyspark.sql import SparkSessionfrom pyspark import SparkConffrom pyspark.sql.types import *from pyspark.sql import functions as Ffrom pyspark.storagelevel import StorageLevelimport jsonimport mathimport numbersfrom pyspark.sql import SQLContextfrom pyspark.sql import Windowimport matplotlib.pyplot as pltimport itertoolsfrom sklearn.metrics import roc_curvefrom sklearn.metrics import roc_auc_scoreday_of_week_udf = F.udf(lambda ts: ts.weekday() if ts is not None else None,StringType())def getSummary(df):summarydf = (df.withColumn('is_BEN_TYPE_Applicant',F.when(F.col("BEN_TYPE") == "Applicant", F.lit(1)).otherwise(F.lit(0))).groupby("MBR_NO").agg(F.max("NO_OF_YR").alias("MAX_NO_OF_YR"),F.sum(F.lit(1)).alias("NUM_LINES"),).withColumn("FRAC_REJECTED_AMT", F.col("TOT_REJECTED_AMT")/F.col("TOT_ORG_PRES_AMT_VALUE")).withColumn("FRAC_BEN_TYPE_Applicant", F.col("TOT_is_BEN_TYPE_Applicant")/F.col("NUM_LINES")).withColumn("FRAC_BEN_TYPE_Spouse", F.col("TOT_is_BEN_TYPE_Spouse")/F.col("NUM_LINES")).withColumn("FRAC_BEN_TYPE_Child", F.col("TOT_is_BEN_TYPE_Child")/F.col("NUM_LINES")).withColumn("FRAC_BEN_TYPE_Parent", F.col("TOT_is_BEN_TYPE_Parent")/F.col("NUM_LINES")).persist())return summarydfdef plot_confusion_matrix(cm, classes,normalize=False,title='Confusion matrix',cmap=plt.cm.Blues):"""This function prints and plots the confusion matrix.Normalization can be applied by setting `normalize=True`."""if normalize:cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]print("Normalized confusion 
matrix")else:print('Confusion matrix, without normalization')print(cm)plt.imshow(cm, interpolation='nearest', cmap=cmap)plt.title(title)plt.colorbar()tick_marks = np.arange(len(classes))plt.xticks(tick_marks, classes, rotation=45)plt.yticks(tick_marks, classes)fmt = '.2f' if normalize else 'd'thresh = cm.max() / 2.for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):plt.text(j, i, format(cm[i, j], fmt),horizontalalignment="center",color="white" if cm[i, j] > thresh else "black")plt.tight_layout()plt.ylabel('True label')plt.xlabel('Predicted label')def checkContain(baseFeatures, allFeatures, transformsList):"""Description : Used to indicate that we want to use the transformed featuresand not the original features if a transform has been doneInput : baseFeatures - Features included in the dataset before anytransforms are appliedallFeatures - All the features present in the dataframe after alltransforms and prep have completedtransformsList - The features from the original dataset that shouldhave transforms applied to themOutput : List of the features we're going to use for the ML model"""resList = []for baseFeat in baseFeatures:if baseFeat not in transformsList:resList.append(baseFeat)else:for feat in allFeatures:if baseFeat in feat:if "~~" in feat or "log10" in feat:resList.append(feat)return resListdef transform_ts_fields(df, ts_cols):"""Description : Produces a timestamp in the standard dow-hod format for thesupplied fieldInput : df - dataframets_cols - timestamp features that need to be formatted correctlyOutput : dataframe with appropriately formatted timestamp features"""col_list = df.columnsfor col in ts_cols:if(col in col_list):df = (df .withColumn(col,F.col(col).cast("timestamp")) .withColumn("{}_dow".format(col),day_of_week_udf(F.col(col))) .withColumn("{}_hod".format(col),F.hour(F.col(col))))return dfdef transform_numeric_fields(df, num_cols):"""Description : Converts all numeric fields into float typeInput : df - dataframenum_cols - 
numeric features that need to be converted to float typeOutput : dataframe with appropriately numerical features converted to floattype"""col_list = df.columnsfor col in num_cols:if(col in col_list):df = (df.withColumn(col, F.col(col).cast("float")))return dfdef transform_log_fields(df, num_cols):"""Description : Produces the log_10 of the fields passed to itInput : df - dataframenum_cols - numeric features whose log values need to be calculatedOutput : dataframe with added log values for the required numericalfeatures"""col_list = df.columnsfor col in num_cols:if(col in col_list):df = (df.withColumn(col + "_log10", F.log(10.0, F.col(col))))return dfdef with_transform(df, param_dict):"""Description : Applies transforms on relevant data fields in the dataInput : df - dataframeparam_dict - parameter dictionaryOutput : dataframe with all appropriate transforms"""df = transform_ts_fields(df, param_dict['BASE_FEATURES_TIMESTAMP'])df = transform_numeric_fields(df, param_dict['BASE_FEATURES_NUMERIC'])df = transform_log_fields(df, param_dict['LOG_TRANSFORM_FEATURES'])df = (df .withColumn("INCUR_PERIOD_SECS",F.col("INCUR_DATE_TO").cast("long") -F.col("INCUR_DATE_FROM").cast("long")))return dfdef run_xgboost(data,feats, scale_pos_weight=1.0, old_model = None):"""Description : Generates an xgboost model based on training dataInput : X_train_pd - Pandas Dataframe, training data inputy_train - training data output/labelsparam_dict - parameter dictionarymax_depth_list - list of max depths of treesn_estimators_list - list of number of treesscoring_metric - scoring metric usedgrid_scoring - grid scoring metricscale_pos_weight - weight applied to positive valsnum_cv = cross-validation splitting strategyOutput : Trained XGBoost Classifier"""X_train, X_test, y_train, y_test = train_test_split(data[feats], data['label'], test_size=0.33)unique, counts = np.unique(y_train, return_counts=True)cdict = dict(zip(unique, counts))temp_pos_weight = cdict[0]/cdict[1]xgb_class = 
XGBClassifier(scale_pos_weight=temp_pos_weight)xgb_class.fit(X=X_train, y=y_train, xgb_model = old_model)y_pred_proba = xgb_class.predict_proba(X_test)threshs = np.arange(0.01,1,0.01)acc = 0prsum = 0abdist = 1bestthresh = 0for thresh in threshs:y_pred_temp = (y_pred_proba[:,1] >= thresh).astype(int)'''precision, recall, thresholds = precision_recall_curve(y_test, y_pred_temp)average_precision = average_precision_score(y_test, y_pred_temp)if ((precision[1]+recall[1])>prsum) and (recall[1]>precision[1]):prsum = precision[1]+recall[1]bestthresh = thresh''''''temp_acc = accuracy_score(np.array(y_test), np.array(y_pred_temp))if temp_acc >acc:acc = temp_accbestthresh = thresh'''cnf_matrix_temp = confusion_matrix(y_test, y_pred_temp)cm = cnf_matrix_temp.astype('float') / cnf_matrix_temp.sum(axis=1)[:, np.newaxis]fp = cm[0][1] * 1.0fn = cm[1][0] * 1.0dist = abs((fn/fp)-1)if dist