mkCrossFrameMExperiment

Source: R/mult_class.R (mkCrossFrameMExperiment.Rd)

Description:

Please see vignette("MultiClassVtreat", package = "vtreat") and
https://winvector.github.io/vtreat/articles/MultiClassVtreat.html.
Usage:

mkCrossFrameMExperiment(
  dframe,
  varlist,
  outcomename,
  ...,
  weights = c(),
  minFraction = 0.02,
  smFactor = 0,
  rareCount = 0,
  rareSig = 1,
  collarProb = 0,
  codeRestriction = NULL,
  customCoders = NULL,
  scale = FALSE,
  doCollar = FALSE,
  splitFunction = vtreat::kWayCrossValidation,
  ncross = 3,
  forceSplit = FALSE,
  catScaling = FALSE,
  y_dependent_treatments = c("catB"),
  verbose = FALSE,
  parallelCluster = NULL,
  use_parallel = TRUE,
  missingness_imputation = NULL,
  imputation_map = NULL
)
Arguments:

Argument | Description
---|---
dframe | data to learn from.
varlist | character, vector of independent variable column names.
outcomename | character, name of outcome column.
... | not used, declared to force named binding of later arguments.
weights | optional training weights for each row.
minFraction | optional minimum frequency a categorical level must have to be converted to an indicator column.
smFactor | optional smoothing factor for impact coding models.
rareCount | optional integer; levels with this count or below are pooled into a shared rare level. Defaults to 0 (off).
rareSig | optional numeric; suppress levels from pooling when their significance is greater than this value. Defaults to NULL (off).
collarProb | what fraction of the data (pseudo-probability) to collar data at if doCollar is set during treatment design.
codeRestriction | what types of variables to produce (character array of level codes, NULL means no restriction).
customCoders | map from code names to custom categorical variable encoding functions (please see https://github.com/WinVector/vtreat/blob/main/extras/CustomLevelCoders.md).
scale | optional, if TRUE replace numeric variables with regression ("move to outcome-scale").
doCollar | optional, if TRUE collar numeric variables by cutting off after a tail-probability specified by collarProb during treatment design.
splitFunction | (optional) see vtreat::buildEvalSets.
ncross | optional scalar >= 2, number of cross-validation rounds to design.
forceSplit | logical, if TRUE force cross-validated significance calculations on all variables.
catScaling | optional, if TRUE use glm() link space, if FALSE use lm() for scaling.
y_dependent_treatments | character, what treatment types to build per-outcome level.
verbose | if TRUE print progress.
parallelCluster | (optional) a cluster object created by package parallel or package snow.
use_parallel | logical, if TRUE use parallel methods.
missingness_imputation | function of signature f(values: numeric, weights: numeric), simple missing value imputer (see the sketch after this table).
imputation_map | map from column names to functions of signature f(values: numeric, weights: numeric), simple missing value imputers.
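The missingness_imputation and imputation_map arguments accept user-supplied imputers with the documented signature f(values: numeric, weights: numeric). A minimal, self-contained sketch follows; the imputer functions are illustrative assumptions, not part of vtreat, and it assumes imputation_map accepts a named list keyed by column name:

library(vtreat)

d <- data.frame(
  x = c('a', 'a', 'a', 'b', 'b', 'b', NA, 'c', 'c'),
  z = c(1, 2, NA, 4, 5, NA, 7, 8, 9),
  y = c(0, 0, 0, 1, 1, 1, 2, 2, 2))

# hypothetical default imputer: weighted mean of the observed values
weighted_mean_impute <- function(values, weights) {
  ok <- !is.na(values)
  sum(values[ok] * weights[ok]) / sum(weights[ok])
}

cfe <- mkCrossFrameMExperiment(
  dframe = d,
  varlist = c('x', 'z'),
  outcomename = 'y',
  missingness_imputation = weighted_mean_impute,
  # column-specific override: impute z with its observed median
  imputation_map = list(z = function(values, weights) median(values, na.rm = TRUE)),
  verbose = FALSE)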
Value:

a named list containing cross_frame, treat_m, score_frame, and fit_obj_id.
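The example below destructures this list with wrapr's unpack; plain base R indexing on the returned list works as well. A brief sketch, re-creating the example's training frame so it runs on its own:

library(vtreat)

dTrainM <- data.frame(
  x = c('a', 'a', 'a', 'a', 'b', 'b', NA, NA),
  z = c(1, 2, 3, 4, 5, NA, 7, NA),
  y = c(0, 0, 0, 1, 0, 1, 2, 1))

res <- mkCrossFrameMExperiment(
  dframe = dTrainM,
  varlist = setdiff(colnames(dTrainM), 'y'),
  outcomename = 'y',
  verbose = FALSE)

dTrainMTreated <- res$cross_frame  # cross-validated treated training data
treatmentsM <- res$treat_m         # treatment plan, for use with prepare()
score_frame <- res$score_frame     # per-variable quality summary
fit_obj_id <- res$fit_obj_id       # identifier for this experiment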
Examples:

# numeric example

set.seed(23525)

# we set up our raw training and application data
dTrainM <- data.frame(
  x = c('a', 'a', 'a', 'a', 'b', 'b', NA, NA),
  z = c(1, 2, 3, 4, 5, NA, 7, NA),
  y = c(0, 0, 0, 1, 0, 1, 2, 1))

dTestM <- data.frame(
  x = c('a', 'b', 'c', NA),
  z = c(10, 20, 30, NA))

# we perform a vtreat cross frame experiment
# and unpack the results into treatmentsM,
# dTrainMTreated, and score_frame
unpack[
  treatmentsM = treat_m,
  dTrainMTreated = cross_frame,
  score_frame = score_frame
  ] <- mkCrossFrameMExperiment(
    dframe = dTrainM,
    varlist = setdiff(colnames(dTrainM), 'y'),
    outcomename = 'y',
    verbose = FALSE)

# the score_frame relates new
# derived variables to original columns
score_frame[, c('origName', 'varName', 'code', 'rsq', 'sig', 'outcome_level')] %.>%
  print(.)
#>    origName   varName  code         rsq        sig outcome_level
#> 1         x    x_catP  catP 0.188721876 0.14797596             0
#> 2         x  x_lev_NA   lev 0.311278121 0.06316822             0
#> 3         x x_lev_x_a   lev 0.188721876 0.14797596             0
#> 4         x x_lev_x_b   lev 0.000000000 1.00000000             0
#> 5         z         z clean 0.247063496 0.09786343             0
#> 6         z   z_isBAD isBAD 0.311278121 0.06316822             0
#> 7         x    x_catP  catP 0.051124479 0.46195519             1
#> 8         x  x_lev_NA   lev 0.016462246 0.67635930             1
#> 9         x x_lev_x_a   lev 0.051124479 0.46195519             1
#> 10        x x_lev_x_b   lev 0.016462246 0.67635930             1
#> 11        z         z clean 0.001915229 0.88677805             1
#> 12        z   z_isBAD isBAD 0.489208457 0.02287099             1
#> 13        x    x_catP  catP 0.253742459 0.21616627             2
#> 14        x  x_lev_NA   lev 0.540072931 0.07117407             2
#> 15        x x_lev_x_a   lev 0.253742459 0.21616627             2
#> 16        x x_lev_x_b   lev 0.103111281 0.43045776             2
#> 17        z         z clean 1.000000000 0.01407811             2
#> 18        z   z_isBAD isBAD 0.103111281 0.43045776             2
#> 19        x X0_x_catB  catB 0.001226957 0.90713685             0
#> 20        x X1_x_catB  catB 0.013810388 0.70220993             1
#> 21        x X2_x_catB  catB 0.540072932 0.07117407             2

# the treated frame is a "cross frame" which
# is a transform of the training data built
# as if the treatment were learned on a different
# disjoint training set to avoid nested model
# bias and over-fit.
dTrainMTreated %.>%
  head(.) %.>%
  print(.)
#>   x_catP        z z_isBAD x_lev_NA x_lev_x_a x_lev_x_b X0_x_catB  X1_x_catB
#> 1   0.50 1.000000       0        0         1         0 0.2876487 -0.2876487
#> 2   0.50 2.000000       0        0         1         0 1.0985456 -0.2876487
#> 3   0.50 3.000000       0        0         1         0 9.9035376 -9.2104154
#> 4   0.50 4.000000       0        0         1         0 9.9035376 -9.2104154
#> 5   0.25 5.000000       0        0         0         1 0.0000000  0.0000000
#> 6   0.25 3.666667       1        0         0         1 0.0000000  0.0000000
#>   X2_x_catB y
#> 1 -8.650835 0
#> 2 -8.922767 0
#> 3 -8.294180 0
#> 4 -8.294180 1
#> 5  0.000000 0
#> 6  0.000000 1

# Any future application data is prepared with
# the prepare method.
dTestMTreated <- prepare(treatmentsM, dTestM, pruneSig = NULL)

dTestMTreated %.>%
  head(.) %.>%
  print(.)
#>   x_catP         z z_isBAD x_lev_NA x_lev_x_a x_lev_x_b X0_x_catB  X1_x_catB
#> 1 0.5000 10.000000       0        0         1         0  1.098546 -0.5877333
#> 2 0.2500 20.000000       0        0         0         1  0.000000  0.5108123
#> 3 0.0625 30.000000       0        0         0         0  0.000000  0.0000000
#> 4 0.2500  3.666667       1        1         0         0 -9.903538  0.5108123
#>   X2_x_catB
#> 1 -8.650835
#> 2 -7.957713
#> 3  0.000000
#> 4  1.945824
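As a hypothetical follow-on to the example above (continuing with dTrainMTreated, score_frame, and dTestMTreated), the treated frames can be passed to any multi-class modeling routine; nnet::multinom and the 0.2 significance cut-off below are arbitrary illustrative choices, not part of vtreat:

library(nnet)

# keep derived variables that look useful for at least one outcome level
good_vars <- unique(score_frame$varName[score_frame$sig < 0.2])

f <- as.formula(paste('as.factor(y) ~', paste(good_vars, collapse = ' + ')))
model <- multinom(f, data = dTrainMTreated, trace = FALSE)

# class predictions for the prepared application data
preds <- predict(model, newdata = dTestMTreated)
print(preds)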