数值变量卡方分箱-R版本

原创
10/24 20:40
阅读数 929

      小编近期接的项目中很多要求要用R来做(小编可是Python高手!),所以基本上将Python代码用R重写(翻译)了一遍。小编差不多三年没正儿八经地用R了,但R是小编最喜欢也是最早使用的统计工具。关于工具:Python、R和SAS,小编都能用一点点(生活所迫!),所以有关工具使用、建模、分析的,欢迎交流;有项目合作的,也欢迎交流!

      卡方分箱的概念,请参考数值变量-卡方分箱。当然可以将其中的分箱标准-卡方改写成基尼、信息熵等。

算法

这里把具体的实现算法再重述一遍:

(1)将数值变量按照等距方式分成SplitNum段(比如100段),此为初始分箱

(2)计算每段的总样本数、好样本数、坏样本数、样本占比等统计值;

(3)计算相邻两段的卡方值,合并卡方值最小的相邻两段;

(4)重复步骤(2)和(3),直至分段个数<=BinMax;

(5)检查每段是否同时含有坏样本和好样本,若某段只含有坏样本或好样本,则将与该段卡方值最小的相邻一段和该段进行合并;

(6)重复步骤(5),直至每段同时含有坏样本和好样本;

(7)检查每段的样本占比是否>=BinPcntMin,若某段的样本占比<BinPcntMin,则将与该段卡方值最小的相邻一段和该段进行合并;

(8)重复步骤(7),直至每段的样本占比>=BinPcntMin。

R代码

1、数据列等距分割函数:

splitCol <- function(col, SplitNum, exclude_attr=NULL){
    # Copyright by 小石头(bigdata_0819@163.com)
    # col: 数据列
    # SplitNum: 等距分割的段数
    # exclude_attri: 不参与分割的值
    # return: 分割节点值列表
    
    col <- setdiff(col, exclude_attr)
    splitPoint <- seq(min(col), max(col), length=SplitNum+1)
    splitPoint[length(splitPoint)] <- Inf
    splitPoint <- splitPoint[-1]
    
    return(splitPoint)
}

assignSplit <- function(x, splitPoint){
    # Copyright by 小石头(bigdata_0819@163.com)
    # x: 标量数值
    # splitPoint:分割节点向量
    # return:落入的分割段
    
    if(x <= splitPoint[1]){
        sP <- splitPoint[1]
    }else{
        for(i in 1:(length(splitPoint)-1)){
            if(x>splitPoint[i] && x<=splitPoint[i+1]){
                sP <- splitPoint[i+1]
            }
        }
    }
    return(sP)
}

2、计算变量总样本、好样本、坏样本、坏样本率的函数:

BinBadRate <- function(df, col, target, BadRateIndicator){
    # Copyright by 小石头(bigdata_0819@163.com)
    # df: 需要计算好坏比率的数据集
    # col: 需要计算好坏比率的变量
    # target: 好坏标签
    # BadRateIndicator: 是否计算好坏比
    
    group1 <- aggregate(df[, target], list(df[, col]), sum)
    colnames(group1) <- c(col, 'bad')
    group2 <- aggregate(df[, target], list(df[, col]), length)
    colnames(group2) <- c(col, 'total')
    group <- merge(group1, group2, by=col)
    group$good <- group$total-group$bad
    group <- group[c(col, 'bad', 'good', 'total')]
    
    if(BadRateIndicator){
        group$BadRate <- group$bad/group$total
    }
    
    return(group)
}

3、计算卡方值函数:

calcChi2 <- function(df, total_col, bad_col, good_col){
    # Copyright by 小石头(bigdata_0819@163.com)
    # df: 包含各属性的全部样本个数、坏样本个数、好样本个数的数据框
    # total_col: 全部样本的个数
    # bad_col: 坏样本的个数
    # good_col:好样本的个数
    
    badRate <- sum(df[, bad_col])/sum(df[, total_col])
    goodRate <- sum(df[, good_col])/sum(df[, total_col])
    
    if(badRate %in% c(0,1)){
        return(0)
    }
    
    df$badExp <- df[, total_col]*badRate
    df$goodExp <- df[, total_col]*goodRate
    
    badChi2 <- sum((df[, bad_col]-df$badExp)^2/df$badExp)
    goodChi2 <- sum((df[, good_col]-df$goodExp)^2/df$goodExp)
    
    Chi2 <- badChi2 + goodChi2
    
    return(Chi2)
}

4、接下来实现单变量分箱的函数,其中会调用上面的几个函数,返回单变量分箱的结果。分箱函数分三个部分,(1)合并相邻两个分组、(2)检查是否每个分组同时含有好和坏、(3)检查每个分组的占比是否大于等于BinPcntMin。其中spe_attri是特殊属性值,初始分箱时将各特殊属性值分别单独作为一组,singleIndicator是特殊属性值在接下来的合并过程中是否参与合并的标识,取值T,不参与合并,取值F,则参与合并:

ContVarChi2Bin <- function(df, col, target, BinMax, BinPcntMin, SplitNum, spe_attri=NULL, singleIndicator){
    # Copyright by 小石头(bigdata_0819@163.com)
    # df: 包含目标变量与分箱变量的数据框
    # col: 需要分箱的变量
    # target: 目标变量,取值0或1
    # BinMax: 最大分箱数
    # BinPcntMin:每箱的最小占比
    # SplitNum:数值变量初始切分的段数,初始将变量等距切分成SplitNum段
    # spe_attri:特殊属性
    # singleIndicator: T:特殊属性单独作为一组不参与卡方分箱,F:特殊属性作为一组参与卡方分箱
    
    if(length(spe_attri)>=1){
        df1 <- df[df[, col] %in% spe_attri, ]
        df2 <- df[!df[, col] %in% spe_attri, ]
    }else{
        df2 <- df
    }
    
    split_col <- splitCol(df2[, col], SplitNum)
    df2$temp <- apply(df2[col], 1, assignSplit, split_col)
    binBadRate <- BinBadRate(df2, 'temp', target, BadRateIndicator=F)
    
    if(length(spe_attri)>=1 && singleIndicator==F && nrow(df1)>0){
        df1$temp <- df1[, col]
        binBadRate1 <- BinBadRate(df1, 'temp', target, BadRateIndicator=F)
        binBadRate <- rbind(binBadRate1, binBadRate)
    }
    if(length(spe_attri)>=1 && singleIndicator==T){
        BinMax <- BinMax-length(unique(df1[, col]))
    }
        
    # 1、迭代合并相邻两个组,直至分箱数<=BinMax
    while(nrow(binBadRate)>BinMax){
        chi2_ <- NULL
        for(i in 1:(nrow(binBadRate)-1)){
            temp_binBadRate <- binBadRate[i:(i+1), ]
            chi2 <- calcChi2(temp_binBadRate, 'total', 'bad', 'good')
            chi2_ <- c(chi2_, chi2)
        }

        combineIndex <- which.min(chi2_)
        combine_binBadRate = binBadRate[combineIndex:(combineIndex+1), ]
        
        binBadRate[combineIndex+1, 'total'] <- sum(combine_binBadRate$total)
        binBadRate[combineIndex+1, 'bad'] <- sum(combine_binBadRate$bad)
        binBadRate[combineIndex+1, 'good'] <- sum(combine_binBadRate$good)
        
        binBadRate = binBadRate[-combineIndex, ]
        rownames(binBadRate) <- NULL
    }
    
    # 2、检查每组是否同时含有好和坏
    binBadRate$BadRate <- binBadRate$bad/binBadRate$total
    minBadRate <- min(binBadRate$BadRate)
    maxBadRate <- max(binBadRate$BadRate)
    while(minBadRate==0 || minBadRate==1){
        BadRate_01 <-binBadRate[binBadRate$BadRate %in% c(0,1), ]
        index_01 <- as.numeric(rownames(BadRate_01)[1])
        
        if(index_01==1){
            combineIndex <- 1
            combine_binBadRate = binBadRate[combineIndex:(combineIndex+1), ]
            
            binBadRate[combineIndex+1, 'total'] <- sum(combine_binBadRate$total)
            binBadRate[combineIndex+1, 'bad'] <- sum(combine_binBadRate$bad)
            binBadRate[combineIndex+1, 'good'] <- sum(combine_binBadRate$good)
            
            binBadRate = binBadRate[-combineIndex, ]
            rownames(binBadRate) <- NULL
        }else if(index_01==nrow(binBadRate)){
            combineIndex <- nrow(binBadRate)-1
            combine_binBadRate = binBadRate[combineIndex:(combineIndex+1), ]
            
            binBadRate[combineIndex+1, 'total'] <- sum(combine_binBadRate$total)
            binBadRate[combineIndex+1, 'bad'] <- sum(combine_binBadRate$bad)
            binBadRate[combineIndex+1, 'good'] <- sum(combine_binBadRate$good)
            
            binBadRate = binBadRate[-combineIndex, ]
            rownames(binBadRate) <- NULL
        }else{
            temp1_binBadRate <- binBadRate[(index_01-1):index_01, ]
            chi2_1 <- calcChi2(temp1_binBadRate, 'total', 'bad', 'good')
            
            temp2_binBadRate <- binBadRate[index_01:(index_01+1), ]
            chi2_2 = calcChi2(temp2_binBadRate, 'total', 'bad', 'good')
            
            if(chi2_1 < chi2_2){
                combineIndex <- index_01-1
            }else{
                combineIndex <- index_01
            }
            
            combine_binBadRate <- binBadRate[combineIndex:(combineIndex+1), ]
            
            binBadRate[combineIndex+1, 'total'] <- sum(combine_binBadRate$total)
            binBadRate[combineIndex+1, 'bad'] <- sum(combine_binBadRate$bad)
            binBadRate[combineIndex+1, 'good'] <- sum(combine_binBadRate$good)
            
            binBadRate <- binBadRate[-combineIndex, ]
            rownames(binBadRate) <- NULL
        }
        binBadRate$BadRate <- binBadRate$bad/binBadRate$total
        minBadRate <- min(binBadRate$BadRate)
        maxBadRate <- max(binBadRate$BadRate)
    }
        
    # 3、检查每组的占比是否大于等于BinPcntMin
    binBadRate$Percent <- binBadRate$total/sum(binBadRate$total)
    minPercent <- min(binBadRate$Percent)
    while(minPercent<BinPcntMin){
        BadRate_minPercent <- binBadRate[binBadRate$Percent==minPercent, ]
        index_minPercent <- as.numeric(rownames(BadRate_minPercent)[1])
        
        if(index_minPercent==1){
            combineIndex <- 1
            combine_binBadRate = binBadRate[combineIndex:(combineIndex+1), ]
            
            binBadRate[combineIndex+1, 'total'] <- sum(combine_binBadRate$total)
            binBadRate[combineIndex+1, 'bad'] <- sum(combine_binBadRate$bad)
            binBadRate[combineIndex+1, 'good'] <- sum(combine_binBadRate$good)
            
            binBadRate = binBadRate[-combineIndex, ]
            rownames(binBadRate) <- NULL
        }else if(index_minPercent==nrow(binBadRate)){
            combineIndex <- nrow(binBadRate)-1
            combine_binBadRate = binBadRate[combineIndex:(combineIndex+1), ]
            
            binBadRate[combineIndex+1, 'total'] <- sum(combine_binBadRate$total)
            binBadRate[combineIndex+1, 'bad'] <- sum(combine_binBadRate$bad)
            binBadRate[combineIndex+1, 'good'] <- sum(combine_binBadRate$good)
            
            binBadRate = binBadRate[-combineIndex, ]
            rownames(binBadRate) <- NULL
        }else{
            temp1_binBadRate <- binBadRate[(index_minPercent-1):index_minPercent, ]
            chi2_1 <- calcChi2(temp1_binBadRate, 'total', 'bad', 'good')
            
            temp2_binBadRate <- binBadRate[index_minPercent:(index_minPercent+1), ]
            chi2_2 = calcChi2(temp2_binBadRate, 'total', 'bad', 'good')
            
            if(chi2_1 < chi2_2){
                combineIndex <- index_minPercent-1
            }else{
                combineIndex <- index_minPercent
            }
            
            combine_binBadRate <- binBadRate[combineIndex:(combineIndex+1), ]
            
            binBadRate[combineIndex+1, 'total'] <- sum(combine_binBadRate$total)
            binBadRate[combineIndex+1, 'bad'] <- sum(combine_binBadRate$bad)
            binBadRate[combineIndex+1, 'good'] <- sum(combine_binBadRate$good)
            
            binBadRate <- binBadRate[-combineIndex, ]
            rownames(binBadRate) <- NULL
        }
        
        binBadRate$Percent <- binBadRate$total/sum(binBadRate$total)
        minPercent <- min(binBadRate$Percent)
    }
    
    binBadRate <- dplyr::select(binBadRate, -c('BadRate', 'Percent'))
    
    if(length(spe_attri)>=1 && singleIndicator==T && nrow(df1)>0){
        binBadRate_single <- BinBadRate(df1, col, target, BadRateIndicator=F)
        colnames(binBadRate_single)[1] <- 'temp'
        bindf <- rbind(binBadRate_single, binBadRate)
    }else{
        bindf <- binBadRate
    }
    colnames(bindf)[1] <- 'upper'
    
    bindf$Percent <- bindf$total/sum(bindf$total)
    bindf$BadRate <- bindf$bad/bindf$total
    
    bindf0 <- data.frame(bin=1:nrow(bindf), lower=c(-Inf, bindf$upper[-nrow(bindf)]))
    bindf <- cbind(bindf0, bindf)
    
    return(bindf)
}

5、批量分箱函数,将所有要分箱的数值变量进行批量分箱处理,函数返回的是存放每个变量分箱结果的列表list:

ContVarChi2BinBatch <- function(df, key, target, BinMax, BinPcntMin, SplitNum, spe_attri=NULL, singleIndicator){
    # Copyright by 小石头(bigdata_0819@163.com)
    # df: 数据框
    # key: 主键
    # target: 目标变量,取值0或1
    # return: 存放每个变量分箱结果的列表
    
    Xvars <- setdiff(colnames(df), c(key, target))
    
    list_bin <- list()
    for(col in Xvars){
        list_bin[[col]] <- ContVarChi2Bin(df, col, target, BinMax, BinPcntMin, SplitNum, spe_attri, singleIndicator)
    }
    
    return(list_bin)
}

6、将变量值替换为分箱值的函数:

txtContVarBin <- function(df, key, target, list_bin, testIndicator){
    # Copyright by 小石头(bigdata_0819@163.com)
    # df: 需要将变量值替换为分箱值的数据框
    # key:主键
    # target:目标变量
    # list_bin:包含各变量分箱结果的列表
    # testIndicator:是否为测试数据框,T:计算测试数据分箱后的占比、坏样本率等,并存放在列表中
    
    df_bin <- df[, c(key, target)]
    Xvars <- setdiff(colnames(df), c(key, target))
    ListBin <- list()
    
    for(col in Xvars){
        
        Bin <- list_bin[[col]]
        vec <- NULL
        for(i in Bin$bin){
            vec[df[, col]>Bin$lower[i] & df[, col]<=Bin$upper[i]] <- i
        }
        df_bin[, col] <- vec
        
        if(testIndicator){
            
            col_bin_BadRate <- BinBadRate(df_bin, col, target, BadRateIndicator=F)
            col_bin_BadRate$Percent <- col_bin_BadRate$total/sum(col_bin_BadRate$total)
            col_bin_BadRate$BadRate <- col_bin_BadRate$bad/col_bin_BadRate$total
            colnames(col_bin_BadRate)[1] <- 'bin'
            col_bin <- merge(Bin[c('bin', 'lower', 'upper')], col_bin_BadRate, by='bin', all.x=T)
            ListBin[[col]] <- col_bin
            
        }
    }
    
    if(testIndicator){
        return(list(df_bin=df_bin, ListBin=ListBin))
    }else{
        return(df_bin)
    }
}

代码的使用方法与文章数值变量-卡方分箱的方法完全一样,可参考。查看某变量的分箱结果如下:

      在几个不同数据集上运行验证都没出现bug,如果大家运行代码出现bug,欢迎交流学习。


传送门:

数值变量-卡方分箱

决策树分箱-R

本文分享自微信公众号 - 大数据建模的一点一滴(bigdatamodeling)。
如有侵权,请联系 support@oschina.cn 删除。
本文参与“OSC源创计划”,欢迎正在阅读的你也加入,一起分享。

展开阅读全文
打赏
1
0 收藏
分享
加载中
更多评论
打赏
0 评论
0 收藏
1
分享
返回顶部
顶部