Home>

I would appreciate advice on how to read the data from 2020/1/20 to 2020/6/30 in the form of "input the past 20 days' data and output the following 20 days' data".
(Example: Input the data from 2020/1/20 to 2020/2/8 and output the data from 2020/2/9 to 2020/2/28)

Details of the data: Data for new corona infected persons in Korea from January 20, 2020 to June 30, 2020.
Information such as date of infection, sex, age, place of residence, etc. is included.
We would like to input this information into the regression model of XGBOOST and output the predicted number of infected people.

Applicable source code

I will omit the preprocessing part of the data.

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
# NOTE: sklearn.externals.joblib was removed in scikit-learn 0.23;
# import the standalone joblib package instead.
import joblib

df = pd.read_csv("data.csv")

# Split into features and target. df_x still contains the target column,
# which is acceptable here because each feature row is paired with a target
# 24 rows (days) later -- past infection counts are legitimate features.
df_x = df
df_y = df[['Number of infected people']]

# Base xgboost regressor; hyperparameters are chosen by grid search below.
reg = xgb.XGBRegressor()
reg_cv = GridSearchCV(
    reg,
    {
        'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.3],
        'min_child_weight': [1, 2, 3, 4, 5],
        'max_depth': [2, 4, 6, 8, 10],
        'n_estimators': [50, 100, 200, 300, 400, 500],
    },
    verbose=1,
)

# Row i of train_x is paired with the target 24 rows ahead (row i + 24),
# so the model learns to predict the infection count 24 days in the future.
# Both slices are 24 rows long, so X and y stay aligned.
train_x = df_x[5:29]
train_y = df_y[29:53]

reg_cv.fit(train_x, train_y)
print(reg_cv.best_params_, reg_cv.best_score_)

# Refit a fresh model on the same data with the best parameters found.
reg = xgb.XGBRegressor(**reg_cv.best_params_)
reg.fit(train_x, train_y)
What I tried

I tried to preprocess the data with reference to this site, but could not apply it successfully to this case.

  • Answer # 1

    I'm not sure if I understand what you want to do, but here's an example of getting past and future dataframes based on a specified date.

    import pandas as pd
    from datetime import timedelta

    # Build a daily test series: one row per day from 2020-01-20 to 2020-06-30,
    # with 'val' simply counting up from 1.
    dr = pd.date_range(start='2020-01-20', end='2020-06-30')
    df = pd.DataFrame({'date': dr, 'val': [i + 1 for i in range(len(dr))]})

    DAYS = 3               # window length in days
    BASE = '2020-02-09'    # first day of the "future" window

    # pd.to_datetime on a scalar string returns a Timestamp directly;
    # no need to wrap it in a list and index out the first element.
    base = pd.to_datetime(BASE)
    past_end = base - timedelta(days=1)
    past_start = past_end - timedelta(days=DAYS - 1)
    future_start = base
    future_end = base + timedelta(days=DAYS - 1)

    # Past window: the DAYS days immediately before BASE.
    df_pas = df[(df['date'] >= past_start) & (df['date'] <= past_end)]
    # Future window: the DAYS days starting at BASE.
    df_fet = df[(df['date'] >= future_start) & (df['date'] <= future_end)]

    print(df_pas)
    # date val
    #17 2020-02-06 18
    #18 2020-02-07 19
    #19 2020-02-08 20
    print(df_fet)
    # date val
    #20 2020-02-09 21
    #21 2020-02-10 22
    #22 2020-02-11 23

  • Answer # 2

    # Sliding-window training: slide a 20-day feature window (and the matching
    # 20-day target window that starts 20 rows later) forward one day at a
    # time, fitting a fresh model on each window.
    import pandas as pd
    import numpy as np
    import xgboost as xgb
    from sklearn.model_selection import GridSearchCV
    from sklearn.metrics import mean_squared_error
    import matplotlib.pyplot as plt
    # sklearn.externals.joblib was removed in scikit-learn 0.23; use the
    # standalone joblib package instead.
    import joblib

    df = pd.read_csv("data.csv")

    # Features and target. Row i of the features is paired with the target
    # 20 rows (days) ahead, so the model predicts 20 days into the future.
    df_x = df
    df_y = df[['Number of infected people']]

    counter = 1
    # Start/end row numbers of the current training windows; the target
    # window is offset 20 rows from the feature window.
    train_x_start = 0
    train_y_start = 20
    train_x_end = 20
    train_y_end = 40
    # Number of rows to slide the windows each iteration.
    slide_rows = 1

    while True:
        # Extract the current windows from the original data by row number.
        # (The original posting was missing the '#' on this comment line,
        # which made it a syntax error.)
        train_x = df_x[train_x_start:train_x_end]
        train_y = df_y[train_y_start:train_y_end]
        print('train:' + str(counter))
        print(train_x)
        print('')
        # Fixed hyperparameters for every window.
        # NOTE: the XGBRegressor constructor argument is 'verbosity', not
        # 'verbose'; and 'early_stopping_rounds' belongs to fit() and needs
        # an eval_set, so it is omitted here.
        params = {
            'learning_rate': 0.1,
            'objective': 'reg:squarederror',
            'tree_method': 'auto',
            'gamma': 1,
            'reg_alpha': 1,
            'verbosity': 1,
            'max_depth': 6,
            'min_child_weight': 1,
        }
        # NOTE(review): a brand-new model is fitted on every window, so only
        # the model trained on the LAST window survives the loop.
        reg = xgb.XGBRegressor(**params)
        reg.fit(train_x, train_y)
        # Slide both windows forward.
        train_x_start += slide_rows
        train_y_start += slide_rows
        train_x_end += slide_rows
        train_y_end += slide_rows
        print('current train_x_start is', train_x_start)
        counter += 1
        if train_x_start > 83:
            break

    # Training RMSE of the final model: df_x[83:103] is exactly the last
    # feature window fitted, and train_y still holds its matching targets
    # (df_y[103:123]) from the final loop iteration.
    pred_train = reg.predict(df_x[83:103])
    print('RMSE for pred_train:', np.sqrt(mean_squared_error(train_y, pred_train)))

    # Persist the final model.
    filename = 'Corona_Korea_0630_model1_Provonce No.1.sav'
    joblib.dump(reg, filename)

    # Plot which features the final model relied on most.
    xgb.plot_importance(reg)
    plt.title("importance in the xgboost Model")
    plt.show()