新浪博客

SAS EM:变量转换结点(Transform Variable Node)

2010-08-07 14:09阅读:
SAS EM:变量转换结点(Transform Variable Node)
SAS EM(Enterprise Miner)企业数据挖掘节点功能详解及代码实现(第八弹)
本文未经作者允许,请勿转载
变量转换结点(Transform Variable Node)提供各种衍生变量的产生功能,数值数据转置等。变量转换结点允许你透过转换在数据中已存在的变量建立新的变量。举例来说,你可以在变量中稳定变异数、移除非线性和更正非正态分布的数据,有几种转换的型态:
SAS EM:变量转换结点(Transform Variable Node)
转换为三种方法:
基本转换:
 Log:取对数。
 Square root:取平方根。
 Inverse:取倒数。
 Square:取平方。
 Exponential:取指数。
 Standardize:标准化。
Binning转换即连续数据分箱:
 Bucket:将数据依照同大小的宽度分成 n 个区间,每个区间内的数据个数通常会不一样。
 Quantile:将数据依照数据个数分成 n 个区间,每个区间内的数据个数会相同。
Best power transforms:最优次方转换
 Optimal binning for relationship to target:根据目标去优化区间。
 Maximize normality:最大化正态分布。
 Maximize correlation with target:最大化与目标的相关系数。
 Equalize spread with target levels:使变量在目标变量各水平(类别)上的离散程度(标准差)尽量相等。
原始变量或是转换变量展现的字段包括:
 Name:原始变量或是转换变量的名称。
 Keep:保留变数做为输出。
 Mean:平均值。
 Std Dev:标准偏差。
 Skew:歪斜值,如果为正,表示在平均值右边的宽度比左边大;如果为负,则表示平均值右边的宽度比左边小。
 Kurtosis:针对分布的形状的测量值,大的值表示含有一些资料距离平均值较远。
 C.V.:变异系数(标准偏差与平均值之比)。
 Formula:转换的公式。
 Format:变数的格式。
 Label:变量的标签。
变量转换结点(Transform Variable Node)
SAS EM:变量转换结点(Transform Variable Node)
设置目标变量
SAS EM:变量转换结点(Transform Variable Node)
对变量进行转换
SAS EM:变量转换结点(Transform Variable Node)
变量转换结果
SAS EM:变量转换结点(Transform Variable Node)
代码实现如下:
/* Session setup.
   DM_SEED is set to 12345 (presumably a random seed; it is not referenced
   anywhere in the visible code). LIBNAME ... LIST writes the attributes of
   the SAMPSIO library to the log. */
%let DM_SEED = 12345;
libname SAMPSIO list;

/* Two DATA step views that both expose EMSAMPLE.DMAGECR unchanged:
   EMDATA.VIEW_KXX feeds the final transformation view, and
   EMPROJ.SMP_VIIA is the table read by the candidate-search steps. */
data EMDATA.VIEW_KXX / view=EMDATA.VIEW_KXX;
    set EMSAMPLE.DMAGECR;
run;

data EMPROJ.SMP_VIIA / view=EMPROJ.SMP_VIIA;
    set EMSAMPLE.DMAGECR;
run;
/* Count the rows of SASHELP.VSTABVW that describe member DMAGECR of
   library EMSAMPLE and store that count in macro variable _tmpa.
   NOPRINT suppresses the printed result. */
proc sql noprint;
    select count(*)
        into :_tmpa
        from sashelp.vstabvw
        where upcase(memname) = upcase('DMAGECR')
          and libname = 'EMSAMPLE';
quit;
/* Pass-through views: EMPROJ.SMP_XGPV re-exposes EMPROJ.SMP_VIIA and
   EMDATA.TRNTSZ2K re-exposes EMDATA.VIEW_KXX, both without changes.
   EMDATA.TRNTSZ2K is defined again at the end of the script, this time
   with the transformed variables. */
data EMPROJ.SMP_XGPV / view=EMPROJ.SMP_XGPV;
    set EMPROJ.SMP_VIIA;
run;

data EMDATA.TRNTSZ2K / view=EMDATA.TRNTSZ2K;
    set EMDATA.VIEW_KXX;
run;
**这里,对AMOUNT变量按Maximize normality(最大化正态分布)的要求来进行变量转换;
* SAS EM:变量转换结点(Transform Variable Node);
/* Candidate transformations of AMOUNT. The later steps score each one and
   select the candidate that looks most normal:
     log(x), x**(1/4), sqrt(x), x**2, x**4 and exp(x / 184.24).
   Only AMOUNT and the six candidate columns are kept. */
data _trntmp(keep=AMOUNT _logvar _rt4var _sqrtvar _sqrvar _pwr4var _expvar);
    set EMPROJ.SMP_VIIA;

    /* The log candidate is set to missing unless the value is positive. */
    if not (AMOUNT + 0 > 0) then _logvar = .;
    else _logvar = log(AMOUNT + 0);

    _rt4var  = (AMOUNT + 0) ** 0.25;
    _sqrtvar = sqrt((AMOUNT + 0));
    _sqrvar  = (AMOUNT + 0) ** 2;
    _pwr4var = (AMOUNT + 0) ** 4;

    /* NOTE(review): 184.24 looks like a scale factor derived from the
       sample (possibly max(AMOUNT)/100) to keep exp() in range - confirm. */
    _expvar  = exp((AMOUNT + 0) / 184.24);
run;
/* Standardize: rescale _trntmp to mean 0 and standard deviation 1 and
   write the result to _trnstd. There is no VAR statement, so PROC STANDARD
   processes every numeric variable, AMOUNT included. */
proc standard data=_trntmp out=_trnstd mean=0 std=1;
run;

/* Order the rows by AMOUNT so that the row number can serve as a rank
   in the next step. */
proc sort data=_trnstd;
    by AMOUNT;
run;
/* Build the reference normal variable: for the row at rank _n_ (rows are
   already sorted by AMOUNT) store the standard-normal quantile
   probit(_n_ / (N + 1)), where N is the number of rows in _trnstd.
   N is taken from the NOBS= option instead of the previously hard-coded
   1000, so the scores stay valid for any sample size. With more than 1000
   rows the old expression passed probit() an argument >= 1. For a
   1000-row sample the result is unchanged. _nobs is a NOBS= variable and
   is not written to the output data set. */
data _trnstd;
    set _trnstd nobs=_nobs;
    normval = probit(_n_ / (_nobs + 1));
run;
/* Correlate AMOUNT and each candidate transformation with the reference
   normal variable normval. The Pearson statistics go to _indtrn (OUTP=)
   and printed output is suppressed. */
proc corr data=_trnstd
          outp=_indtrn
          noprint;
    with normval;
    var AMOUNT _logvar _rt4var _sqrtvar _sqrvar _pwr4var _expvar;
run;
/* Turn the correlation row of _indtrn into one score per candidate power.
   Each score (_val) is the square of a weighted average of the candidate's
   own correlation (weight 2) and those of its neighbouring powers
   (weight 1 each). The two end candidates have only one neighbour.
   _power values: 0 = log, .25, .5, 1 = untransformed AMOUNT, 2, 4,
   and 10 = the exponential candidate. */
data _modtmp(keep=_power _val);
    set _indtrn;

    /* Only the CORR row carries correlations; other rows emit nothing. */
    if _type_ = 'CORR';

    _power = 0;    _val = ((2*_logvar + _rt4var) / 3) ** 2;              output;
    _power = .25;  _val = ((_logvar + 2*_rt4var + _sqrtvar) / 4) ** 2;   output;
    _power = .5;   _val = ((_rt4var + 2*_sqrtvar + AMOUNT) / 4) ** 2;    output;
    _power = 1;    _val = ((_sqrtvar + 2*AMOUNT + _sqrvar) / 4) ** 2;    output;
    _power = 2;    _val = ((AMOUNT + 2*_sqrvar + _pwr4var) / 4) ** 2;    output;
    _power = 4;    _val = ((_sqrvar + 2*_pwr4var + _expvar) / 4) ** 2;   output;
    _power = 10;   _val = ((_pwr4var + 2*_expvar) / 3) ** 2;             output;
run;
/* Rank the candidates so that the highest score comes first. _modtmp is
   the most recently created data set, which is what the bare PROC SORT
   operated on; it is named explicitly here. */
proc sort data=_modtmp;
    by descending _val;
run;

/* Store the _power of the first (best) row in macro variable _tmpa.
   _tmpa is preset to 1 and keeps that value if the query returns no row. */
%let _tmpa = 1;
proc sql noprint;
    select _power
        into :_tmpa
        from _modtmp;
quit;
/* Remove the intermediate WORK tables of the AMOUNT search.
   _trnstd is not in the list and is left in place. */
proc datasets library=work nolist;
    delete _trntmp
           _modtmp
           _indtrn;
run;
quit;
/* User-defined numeric format AGE_1BY_ (stored in WORK) that maps a value
   to one of four bucket labels with cut points 33, 47 and 61.
   NOTE(review): adjacent ranges share their end points. Per the PROC
   FORMAT documentation a shared end point goes to the first range that
   lists it (e.g. 33 -> '0001:low-33') - confirm this is intended. */
proc format lib=WORK;
    value AGE_1BY_
        low-33  = '0001:low-33'
        33-47   = '0002:33-47'
        47-61   = '0003:47-61'
        61-high = '0004:61-high'
    ;
run;

/* Candidate transformations of DURATION, kept together with the class
   variable GOOD_BAD:
     log(x), x**(1/4), sqrt(x), x**2, x**4 and exp(x / 1). */
data _trntmp(keep=DURATION GOOD_BAD _logvar _rt4var _sqrtvar _sqrvar _pwr4var _expvar);
    set EMPROJ.SMP_VIIA;

    /* The log candidate is set to missing unless the value is positive. */
    if not (DURATION + 0 > 0) then _logvar = .;
    else _logvar = log(DURATION + 0);

    _rt4var  = (DURATION + 0) ** 0.25;
    _sqrtvar = sqrt((DURATION + 0));
    _sqrvar  = (DURATION + 0) ** 2;
    _pwr4var = (DURATION + 0) ** 4;

    /* The divisor is 1 here, i.e. DURATION is exponentiated unscaled. */
    _expvar  = exp((DURATION + 0) / 1);
run;

/* Rescale _trntmp to mean 0 and standard deviation 1 (no VAR statement,
   so every numeric variable is processed) and write it to _trnstd. */
proc standard data=_trntmp
              out=_trnstd
              mean=0
              std=1;
run;

/* Standard deviation of DURATION and of each candidate within every
   GOOD_BAD level. The output also contains the overall row (_type_ = 0). */
proc summary data=_trnstd;
    class GOOD_BAD;
    var DURATION _logvar _rt4var _sqrtvar _sqrvar _pwr4var _expvar;
    output out=_indtrn std=;
run;

/* Standard deviation, across the per-level rows only (_type_ = 1), of the
   standard deviations computed above: a measure of how unequal the spread
   is between GOOD_BAD levels. _indtrn is replaced by this single summary
   row, which has _type_ = 0 because there is no CLASS statement. */
proc summary data=_indtrn;
    var DURATION _logvar _rt4var _sqrtvar _sqrvar _pwr4var _expvar;
    where _type_ = 1;
    output out=_indtrn std=;
run;
/* Turn the summary row of _indtrn into one score per candidate power.
   Each score (_val) is the square of a weighted average of the candidate's
   own value (weight 2) and those of its neighbouring powers (weight 1
   each). The two end candidates have only one neighbour.
   _power values: 0 = log, .25, .5, 1 = untransformed DURATION, 2, 4,
   and 10 = the exponential candidate. */
data _modtmp(keep=_power _val);
    set _indtrn;

    /* Only the _type_ = 0 row is scored; other rows emit nothing. */
    if _type_ = 0;

    _power = 0;    _val = ((2*_logvar + _rt4var) / 3) ** 2;              output;
    _power = .25;  _val = ((_logvar + 2*_rt4var + _sqrtvar) / 4) ** 2;   output;
    _power = .5;   _val = ((_rt4var + 2*_sqrtvar + DURATION) / 4) ** 2;  output;
    _power = 1;    _val = ((_sqrtvar + 2*DURATION + _sqrvar) / 4) ** 2;  output;
    _power = 2;    _val = ((DURATION + 2*_sqrvar + _pwr4var) / 4) ** 2;  output;
    _power = 4;    _val = ((_sqrvar + 2*_pwr4var + _expvar) / 4) ** 2;   output;
    _power = 10;   _val = ((_pwr4var + 2*_expvar) / 3) ** 2;             output;
run;
/* Rank the candidates so that the smallest score (ascending _val) comes
   first. _modtmp is the most recently created data set, which is what the
   bare PROC SORT operated on; it is named explicitly here. */
proc sort data=_modtmp;
    by _val;
run;

/* Store the _power of the first row in macro variable _tmpa.
   _tmpa is preset to 1 and keeps that value if the query returns no row. */
%let _tmpa = 1;
proc sql noprint;
    select _power
        into :_tmpa
        from _modtmp;
quit;
/* Remove the intermediate WORK tables of the DURATION search.
   _trnstd is not in the list and is left in place. */
proc datasets library=work nolist;
    delete _trntmp
           _modtmp
           _indtrn;
run;
quit;
/* Apply the transformations.
   Redefines the view EMDATA.TRNTSZ2K over EMDATA.VIEW_KXX: the raw inputs
   DURATION, AMOUNT and AGE are dropped and replaced by derived columns.
   The step relies on the user format AGE_1BY_ being available when the
   view is read. */
data EMDATA.TRNTSZ2K / view=EMDATA.TRNTSZ2K;
    set EMDATA.VIEW_KXX;

    /* ---- DURATION ------------------------------------------------- */
    drop DURATION;

    /* Log transform; missing unless DURATION is positive. */
    format DURA_BF9 BEST12.;
    label DURA_BF9='DURATION: Equalize spread among GOOD_BAD';
    if DURATION > 0 then
        DURA_BF9 = log(DURATION);
    else DURA_BF9 = .;

    /* Standardization with fixed constants.
       NOTE(review): 20.903 and 12.05881 are presumably the sample mean and
       standard deviation of DURATION - confirm. */
    format DURA_QPD BEST12.;
    label DURA_QPD='standardize(DURATION)';
    DURA_QPD = (DURATION - 20.903) / 12.05881;

    /* ---- AMOUNT --------------------------------------------------- */
    drop AMOUNT;

    format AMOU_NQU BEST12.;
    label AMOU_NQU='square(AMOUNT)';
    AMOU_NQU = (AMOUNT) ** 2;

    /* Reciprocal. Zero or missing AMOUNT yields a missing value explicitly,
       so reading the view no longer raises division-by-zero or
       missing-value notes in the log. The resulting values are the same as
       before (SAS already returned missing in those cases). This is
       consistent with the guarded log transforms in this step. */
    format AMOU_8V9 BEST12.;
    label AMOU_8V9='inverse(AMOUNT)';
    if missing(AMOUNT) or AMOUNT = 0 then
        AMOU_8V9 = .;
    else AMOU_8V9 = 1 / (AMOUNT);

    /* Log transform; missing unless AMOUNT is positive. */
    format AMOU_RQU BEST12.;
    label AMOU_RQU='AMOUNT: Maximize normality';
    if AMOUNT > 0 then
        AMOU_RQU = log(AMOUNT);
    else AMOU_RQU = .;

    /* ---- AGE ------------------------------------------------------ */
    drop AGE;

    /* Bucketing is done by the format alone: the stored value is the raw
       AGE, displayed through AGE_1BY_ with width 17. */
    format AGE_1BYU AGE_1BY_17.;
    label AGE_1BYU='Bucket(AGE)';
    AGE_1BYU = AGE;
run;
还有一种为BIN转换,方法与变量选择时的方法类似,大家自己去研究吧。
本文用到的SAS数据集为dmagecr.sas7bdat,其下载地址:
http://ishare.iask.sina.com.cn/f/8641122.html
本系列全部数据下载地址:
http://iask.sina.com.cn/u/1564153724/ish

我的更多文章

下载客户端阅读体验更佳

APP专享