Home>

I have x, x2, y datasets
When performing cross-validation to determine hyperparameters in X, y regression problems
I want to finally return to y with x being autoscaling and x2 being X = x + x2 without processing.
When I try the following, an error occurs at the stage of X = x + x2.

Probably, there is no data number when extracting x and x2 randomly, but how can I extract the same sample number from two data frames randomly and between data frames?

x:

No. a b
1 3.1 3.1 1.2
2 10.0 2.2
3 1.1 1.2
Four 3.5 3.2

x2:

No. c d
1 0 1
2 0 0
3 01
Four 1 0

When extracting at random, I want to unify only the data numbers of both x and x2 data.

def double_cross_validation (gs_cv, x, x2, y, outer_fold_number, do_autoscaling = True, random_state = 0):
    "" "
    Double Cross-Validation (DCV)
    Estimate y-values ​​in DCV
    Parameters
    ----------
    gs_cv: object of GridSearchCV (sklearn.model_selection.GridSearchCV)
        for more details, please go to https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    x: numpy.array or pandas.DataFrame
        m x n matrix of X-variables of training data,
        m is the number of training sammples and
        n is the number of X-variables
    y: numpy.array or pandas.DataFrame
        m x 1 vector of a Y-variable of training data
    outer_fold_number: int
        Fold number in outer CV (fold number in inner CV is included in gs_cv)
    do_autoscaling: bool
        flag of autoscaling, if True, do autoscaling
    random_state: int
        random seed, if None, random seed is not set
    Returns
    -------
    estimated_y: numpy.array
        estimated y-values ​​in DCV
    "" "
    x = np.array (x)
    x2 = np.array (x2)
    y = np.array (y)
    # how to divide datase in outer CV
    min_number = math.floor (x.shape [0]/outer_fold_number)
    mod_number = x.shape [0] --min_number * outer_fold_number
    index = np.matlib.repmat (np.arange (1, outer_fold_number + 1, 1), 1, min_number) .ravel ()
    if mod_number! = 0:
        index = np.r_ [index, np.arange (1, mod_number + 1, 1)]
    if random_state! = None:
        np.random.seed (random_state)
    fold_index_in_outer_cv = np.random.permutation (index)
    np.random.seed (0)
    estimated_y = np.zeros (len (y))
    for fold_number_in_outer_cv in np.arange (1, outer_fold_number + 1, 1):
        print (fold_number_in_outer_cv,'/', outer_fold_number)
        # divide training data and test data
        x_train = x [fold_index_in_outer_cv! = fold_number_in_outer_cv,:]. copy ()
        x2_train = x2 [fold_index_in_outer_cv! = fold_number_in_outer_cv,:].copy ()
        y_train = y [fold_index_in_outer_cv! = fold_number_in_outer_cv] .copy ()
        x_test = x [fold_index_in_outer_cv == fold_number_in_outer_cv,:].copy ()
        x2_test = x2 [fold_index_in_outer_cv == fold_number_in_outer_cv,:].copy ()
        # shuffle samples
        if random_state! = -999:
            np.random.seed (0)
        random_numbers = np.random.permutation (np.arange (x_train.shape [0]))
        x_train = x_train [random_numbers,:]
        x2_train = x2_train [random_numbers,:]
        y_train = y_train [random_numbers]
        np.random.seed (0)
        # autoscaling
        if do_autoscaling:
            autoscaled_x_train_pre = (x_train --x_train.mean (axis = 0))/x_train.std (axis = 0, ddof = 1)
            autoscaled_x_train = np.concatenate ([autoscaled_x_train_pre, x2_train],axis = 1)
            autoscaled_y_train = (y_train --y_train.mean ())/y_train.std (ddof = 1)
            autoscaled_x_test_pre = (x_test --x_train.mean (axis = 0))/x_train.std (axis = 0, ddof = 1)
            autoscaled_x_train = np.concatenate ([[autoscaled_x_test_pre],

 [x2_test]],

 axis = 1)
        else: else:
            autoscaled_x_train = x_train.copy ()
            autoscaled_y_train = y_train.copy ()
            autoscaled_x_test = x_test.copy ()
        # inner CV
        gs_cv.fit (autoscaled_x_train, autoscaled_y_train)
        #modeling
        model = getattr (gs_cv,'estimator')
        hyperparameters = list (gs_cv.best_params_.keys ())
        for hyperparameters in hyperparameters:
            setattr (model, hyperparameter, gs_cv.best_params_ [hyperparameter])
        model.fit (autoscaled_x_train, autoscaled_y_train)
        #prediction
        estimated_y_test = np.ndarray.flatten (model.predict (autoscaled_x_test))
        if do_autoscaling:
            estimated_y_test = estimated_y_test * y_train.std (ddof = 1) + y_train.mean ()
        estimated_y [fold_index_in_outer_cv == fold_number_in_outer_cv] = estimated_y_test # store
    return estimated_y
#Settings
inner_fold_number = 10 # "fold_number"-fold cross-validation (CV) for inter CV
outer_fold_number = 54 # "fold_number"-fold CV for outer CV
parameters = {
    "C": [2 ** n for n in range (-5, 11)],

         #Candidates of C
    "epsilon": [2 ** n for n in range (-10, 1)],

    #Candidates of epsilon
    "gamma": [2 ** n for n in range (-20, 11)],

     #Candidates of gamma
}
# DCV
inner_cv = GridSearchCV (svm.SVR (), parameters, scoring ='max_error', n_jobs = -1, cv = inner_fold_number)
y_pred = double_cross_validation (gs_cv = inner_cv, x = x, x2 = x2, y = y, outer_fold_number = outer_fold_number, do_autoscaling = True, random_state = 0)
# Generalization performance from DCV using SVR
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
print (r2_score (y, y_pred))
print (mean_absolute_error (y, y_pred))

error:
ValueError Traceback (most recent call last)
in
99 # DCV
100 inner_cv = GridSearchCV (svm.SVR (), parameters, scoring ='max_error', n_jobs = -1, cv = inner_fold_number)
->101 y_pred = double_cross_validation (gs_cv = inner_cv, x = x, x2 = x2, y = y, outer_fold_number = outer_fold_number, do_autoscaling = True, random_state = 0)
102
103 # Generalization performance from DCV using SVR

in double_cross_validation (gs_cv, x, x2, y, outer_fold_number, do_autoscaling, random_state)
66 autoscaled_y_train = (y_train --y_train.mean ())/y_train.std (ddof = 1)
67 autoscaled_x_test_pre = (x_test --x_train.mean (axis = 0))/x_train.std (axis = 0, ddof = 1)
--->68 autoscaled_x_train = np.concatenate ([[autoscaled_x_test_pre], [x2_test]], axis = 1)
69 else:
70 autoscaled_x_train = x_train.copy ()

ValueError: all the input array dimensions except for the concatenation axis must match exactly
Thank you.

] (76b8eeb2cfb84fe71c8d4025542ffe90.jpeg)

  • Answer # 1

    autoscaled_x_train = np.insert (autoscaled_x_train_pre, x2_train)
    Looking at the error, there are only two arguments for np.insert.
    Since arr, obj, values ​​are required as arguments, but there are only two
    TypeError: insert () missing 1 required positional argument:'values'
    It is said that.
    Please specify the required arguments.

    numpy.insert (arr, obj, values, axis = None)
    numpy.insert

    Postscript
    import numpy as np
    autoscaled_x_train_pre = np.array ([[1.1,1.4], [5.8,6.2]])
    # array ([[1.1, 1.4],
    # [5.8, 6.2]])
    x2_train = np.array ([[0,1], [0,1]])
    # array ([[0, 1],
    # [0, 1]])
    autoscaled_x_train = np.concatenate ([autoscaled_x_train_pre, x2_train], axis = 1)
    # array ([[1.1, 1.4, 0., 1.],
    # [5.8, 6.2, 0., 1.]])