数会相同。
Best power transforms:最优次方转换
Optimal binning for relationship to target:根据目标去优化区间。
Maximize normality:最大化正态分布。
Maximize correlation with target:最大化与目标的相关系数。
Equalize spread with target levels:使变量在目标变量各个水平(类别)之间的离散程度(方差)尽量一致。
原始变量或是转换变量展现的字段包括:
Name:原始变量或是转换变量的名称。
Keep:保留变数做为输出。
Mean:平均值。
Std Dev:标准偏差。
Skew:偏度(歪斜值)。如果为正,表示分布在平均值右侧的尾部比左侧长;如果为负,则表示分布在平均值左侧的尾部比右侧长。
Kurtosis:针对分布的形状的测量值,大的值表示含有一些资料距离平均值较远。
C.V.:变异系数(Coefficient of Variation),即标准偏差与平均值之比。
Formula:转换的公式。
Format:变数的格式。
Label:变量的标签。
变量转换结点(Transform Variable Node)
设置目标变量
对变量进行转换
变量转换结果
代码实现如下:
/* Session setup: seed macro variable, library listing and the pass-through
   views that the transformation steps read from. */
%let DM_SEED = 12345;

/* Write the attributes of the SAMPSIO library to the log. */
libname SAMPSIO list;

/* Two views that expose EMSAMPLE.DMAGECR unchanged. */
data EMDATA.VIEW_KXX / view=EMDATA.VIEW_KXX;
    set EMSAMPLE.DMAGECR;
run;

data EMPROJ.SMP_VIIA / view=EMPROJ.SMP_VIIA;
    set EMSAMPLE.DMAGECR;
run;

/* Store in macro variable _tmpa the number of SASHELP.VSTABVW entries
   that match EMSAMPLE.DMAGECR. */
proc sql noprint;
    select count(*)
        into :_tmpa
        from sashelp.vstabvw
        where libname = 'EMSAMPLE'
          and upcase(memname) = upcase('DMAGECR');
quit;

/* Views layered on top of the two views defined above. */
data EMPROJ.SMP_XGPV / view=EMPROJ.SMP_XGPV;
    set EMPROJ.SMP_VIIA;
run;

data EMDATA.TRNTSZ2K / view=EMDATA.TRNTSZ2K;
    set EMDATA.VIEW_KXX;
run;
/* AMOUNT, "Maximize normality": build the candidate transforms of AMOUNT.
   Candidates are log(x), x**0.25, sqrt(x), x**2, x**4 and exp(x / 184.24);
   the later steps pick the one that looks most normal. */
data _trntmp(keep=AMOUNT _logvar _rt4var _sqrtvar _sqrvar _pwr4var _expvar);
    set EMPROJ.SMP_VIIA;

    /* log is only taken for strictly positive values, otherwise missing */
    if AMOUNT + 0 > 0 then _logvar = log(AMOUNT + 0);
    else _logvar = .;

    _rt4var  = (AMOUNT + 0) ** 0.25;
    _sqrtvar = sqrt((AMOUNT + 0));
    _sqrvar  = (AMOUNT + 0)**2;
    _pwr4var = (AMOUNT + 0)**4;

    /* NOTE(review): 184.24 is a hard-coded scale, presumably chosen to keep
       the exponent bounded for this data set - confirm. */
    _expvar  = exp((AMOUNT + 0)/184.24);
run;
/* Standardize the variables of _trntmp to mean 0 and standard deviation 1. */
proc standard data=_trntmp out=_trnstd mean=0 std=1;
run;

/* Order the rows by AMOUNT so that _n_ in the next step is the rank of AMOUNT. */
proc sort data=_trnstd;
    by AMOUNT;
run;

/* Attach the reference normal score for each rank: probit(rank / (n + 1)).
   The row count n is read from the data set through NOBS= instead of being
   hard-coded as 1000, so the argument of probit stays strictly inside (0, 1)
   for any input size. _nobs is a temporary NOBS= variable and is not written
   to the output data set. */
data _trnstd;
    set _trnstd nobs=_nobs;
    normval = probit(_n_/(_nobs + 1));
run;
/* Correlate AMOUNT and each candidate transform with the reference normal
   score normval; the statistics are written to _indtrn, nothing is printed. */
proc corr data=_trnstd outp=_indtrn noprint;
    with normval;
    var AMOUNT _logvar _rt4var _sqrtvar _sqrvar _pwr4var _expvar;
run;
/* Turn the correlation row of _indtrn into one row per candidate power.
   _val is the squared weighted average of the correlation for that power
   and for its neighbours on the list (log, 1/4, 1/2, 1, 2, 4, exp):
   weights 1-2-1 in the middle, 2-1 at the two ends.
   _power = 0 tags the log candidate and _power = 10 tags the exp candidate. */
data _modtmp(keep=_power _val);
    set _indtrn;

    /* only the CORR row carries the correlations */
    if _type_ = 'CORR';

    _power = 0;
    _val = ((2*_logvar + _rt4var)/3)**2;
    output;

    _power = .25;
    _val = ((_logvar + 2*_rt4var + _sqrtvar)/4)**2;
    output;

    _power = .5;
    _val = ((_rt4var + 2*_sqrtvar + AMOUNT)/4)**2;
    output;

    _power = 1;
    _val = ((_sqrtvar + 2*AMOUNT + _sqrvar)/4)**2;
    output;

    _power = 2;
    _val = ((AMOUNT + 2*_sqrvar + _pwr4var)/4)**2;
    output;

    _power = 4;
    _val = ((_sqrvar + 2*_pwr4var + _expvar)/4)**2;
    output;

    _power = 10;
    _val = ((_pwr4var + 2*_expvar)/3)**2;
    output;
run;
/* Rank the candidate powers by _val, largest first. The data set is named
   explicitly so the step does not depend on _modtmp happening to be the
   most recently created data set. */
proc sort data=_modtmp;
    by descending _val;
run;

/* Default for _tmpa, then overwrite it with _power from the first row of
   the sorted table (INTO with a single macro variable keeps the first row). */
%let _tmpa=1;
proc sql;
    reset noprint;
    select _power
        into :_tmpa
        from _modtmp;
quit;

/* Remove the intermediate work tables of this search. */
proc datasets lib=work nolist;
    delete _trntmp _modtmp _indtrn;
run;
quit;
/* Format AGE_1BY_ in WORK: maps a numeric value to one of four bucket labels.
   Adjacent ranges share an endpoint (33, 47, 61); under the PROC FORMAT
   range rules a shared endpoint belongs to the first range that lists it. */
proc format lib=WORK;
    value AGE_1BY_
        low-33  = '0001:low-33'
        33-47   = '0002:33-47'
        47-61   = '0003:47-61'
        61-high = '0004:61-high';
run;
/* DURATION: build the candidate transforms of DURATION and keep the class
   variable GOOD_BAD next to them for the per-class statistics that follow.
   Candidates are log(x), x**0.25, sqrt(x), x**2, x**4 and exp(x / 1). */
data _trntmp(keep=DURATION GOOD_BAD _logvar _rt4var _sqrtvar _sqrvar _pwr4var _expvar);
    set EMPROJ.SMP_VIIA;

    /* log is only taken for strictly positive values, otherwise missing */
    if DURATION + 0 > 0 then _logvar = log(DURATION + 0);
    else _logvar = .;

    _rt4var  = (DURATION + 0) ** 0.25;
    _sqrtvar = sqrt((DURATION + 0));
    _sqrvar  = (DURATION + 0)**2;
    _pwr4var = (DURATION + 0)**4;
    _expvar  = exp((DURATION + 0)/1);
run;
/* Standardize the variables of _trntmp to mean 0 and standard deviation 1. */
proc standard data=_trntmp out=_trnstd mean=0 std=1;
run;

/* Standard deviation of DURATION and of each candidate, computed with
   GOOD_BAD as the class variable. */
proc summary data=_trnstd;
    class GOOD_BAD;
    var DURATION _logvar _rt4var _sqrtvar _sqrvar _pwr4var _expvar;
    output out=_indtrn std=;
run;

/* Keep the per-level rows (_type_ = 1) and take the standard deviation of
   those standard deviations; _indtrn is overwritten with the result. */
proc summary data=_indtrn;
    where _type_=1;
    var DURATION _logvar _rt4var _sqrtvar _sqrvar _pwr4var _expvar;
    output out=_indtrn std=;
run;
/* Turn the summary row of _indtrn into one row per candidate power.
   _val is the squared weighted average of the statistic for that power and
   for its neighbours on the list (log, 1/4, 1/2, 1, 2, 4, exp):
   weights 1-2-1 in the middle, 2-1 at the two ends.
   _power = 0 tags the log candidate and _power = 10 tags the exp candidate. */
data _modtmp(keep=_power _val);
    set _indtrn;

    /* only rows with _type_ = 0 are used */
    if _type_ = 0;

    _power = 0;
    _val = ((2*_logvar + _rt4var)/3)**2;
    output;

    _power = .25;
    _val = ((_logvar + 2*_rt4var + _sqrtvar)/4)**2;
    output;

    _power = .5;
    _val = ((_rt4var + 2*_sqrtvar + DURATION)/4)**2;
    output;

    _power = 1;
    _val = ((_sqrtvar + 2*DURATION + _sqrvar)/4)**2;
    output;

    _power = 2;
    _val = ((DURATION + 2*_sqrvar + _pwr4var)/4)**2;
    output;

    _power = 4;
    _val = ((_sqrvar + 2*_pwr4var + _expvar)/4)**2;
    output;

    _power = 10;
    _val = ((_pwr4var + 2*_expvar)/3)**2;
    output;
run;
/* Rank the candidate powers by _val, smallest first. The data set is named
   explicitly so the step does not depend on _modtmp happening to be the
   most recently created data set. */
proc sort data=_modtmp;
    by _val;
run;

/* Default for _tmpa, then overwrite it with _power from the first row of
   the sorted table (INTO with a single macro variable keeps the first row). */
%let _tmpa=1;
proc sql;
    reset noprint;
    select _power
        into :_tmpa
        from _modtmp;
quit;

/* Remove the intermediate work tables of this search. */
proc datasets lib=work nolist;
    delete _trntmp _modtmp _indtrn;
run;
quit;
/* Final transformation view: reads EMDATA.VIEW_KXX, adds the transformed
   variables and drops the originals DURATION, AMOUNT and AGE.
   Label literals are written on a single line each so that no source line
   break ends up inside the label text. */
data EMDATA.TRNTSZ2K / view=EMDATA.TRNTSZ2K;
    set EMDATA.VIEW_KXX;

    /* ---- DURATION ---- */
    drop DURATION;

    /* log transform, missing for non-positive values */
    format DURA_BF9 BEST12.;
    label DURA_BF9='DURATION: Equalize spread among GOOD_BAD';
    if DURATION > 0 then DURA_BF9=log(DURATION);
    else DURA_BF9 = .;

    /* NOTE(review): 20.903 and 12.05881 are hard-coded, presumably the
       sample mean and standard deviation of DURATION - confirm. */
    format DURA_QPD BEST12.;
    label DURA_QPD='standardize(DURATION)';
    DURA_QPD=(DURATION - 20.903) / 12.05881;

    /* ---- AMOUNT ---- */
    drop AMOUNT;

    format AMOU_NQU BEST12.;
    label AMOU_NQU='square(AMOUNT)';
    AMOU_NQU=(AMOUNT)**2;

    /* Reciprocal, guarded like the log transforms: a zero or missing AMOUNT
       yields a missing value directly instead of raising a division-by-zero
       or missing-value condition in the log. */
    format AMOU_8V9 BEST12.;
    label AMOU_8V9='inverse(AMOUNT)';
    if AMOUNT ne 0 and not missing(AMOUNT) then AMOU_8V9=1/(AMOUNT);
    else AMOU_8V9 = .;

    /* log transform, missing for non-positive values */
    format AMOU_RQU BEST12.;
    label AMOU_RQU='AMOUNT: Maximize normality';
    if AMOUNT > 0 then AMOU_RQU=log(AMOUNT);
    else AMOU_RQU = .;

    /* ---- AGE ---- */
    drop AGE;

    /* AGE is copied as is; the bucket labels come from format AGE_1BY_
       applied with width 17. */
    format AGE_1BYU AGE_1BY_17.;
    label AGE_1BYU='Bucket(AGE)';
    AGE_1BYU=AGE;
run;
还有一种为BIN转换,方法与变量选择时的方法类似,大家自己去研究吧。
本文用到的SAS数据集为dmagecr.sas7bdat,其下载地址:
http://ishare.iask.sina.com.cn/f/8641122.html
本系列全部数据下载地址:
http://iask.sina.com.cn/u/1564153724/ish