Amount Withdrawn Model Part 4

Task 2: Building regression models to predict "Amount Withdrawn"

# Scorer wrapper so cross_val_score reports MAE directly.
# NOTE(review): make_scorer defaults to greater_is_better=True, so sklearn
# would treat a larger MAE as "better" if this scorer were used for model
# selection; here the CV scores are only printed, so it is harmless.
evaluate_mae = make_scorer(mean_absolute_error)
# selected_columns_top50 = dataset[selected_columns].corr()[target].dropna().sort_values(ascending = False)[:50].keys().values

# Feature matrix: copy of the preprocessed frame built earlier in the notebook.
# Target: the first entry of the global `target` list (AmountWithDrawn).
X = dataset_fin.copy()
y= dataset[target[0]]

# Hold out a third of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Candidate regressors, all left at their default hyperparameters.
linear_reg = LinearRegression()
decision_tree_reg = DecisionTreeRegressor()
rf_reg = RandomForestRegressor()
gbm_reg = GradientBoostingRegressor()
xgb_reg = XGBRegressor()
lgb_reg = LGBMRegressor()

Approach 1

# Approach 1: fit each candidate regressor, score it with 10-fold CV on the
# training set, and report hold-out MAE/RMSE on the test split.
# FIX: the original stored model names in a comma-joined string and recovered
# each estimator with eval(), which is fragile and unsafe. An explicit
# name -> estimator mapping gives the same iteration order and output.
models = {
    'linear_reg': linear_reg,
    'decision_tree_reg': decision_tree_reg,
    'rf_reg': rf_reg,
    'gbm_reg': gbm_reg,
    'xgb_reg': xgb_reg,
    'lgb_reg': lgb_reg,
}
for m, model in models.items():
    print(m)

    model.fit(X_train.values, y_train.values)
    cv_results = cross_val_score(model, X_train.values, y_train.values, cv=10, scoring=evaluate_mae)
    preds_y = model.predict(X_test.values)
    print(f'Test Results of model {m}')
    print(f'Insample:{cv_results.mean()}, Test_MAE :{mean_absolute_error(preds_y,y_test)}, RMSE:{np.sqrt(mean_squared_error(preds_y,y_test))}')
linear_reg
Test Results of model linear_reg
Insample:25845.110961501963, Test_MAE :25984.326239575894, RMSE:38645.92650508666
decision_tree_reg
Test Results of model decision_tree_reg
Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
rf_reg
Test Results of model rf_reg
Insample:2502.3302651438826, Test_MAE :2483.5484329535207, RMSE:2931.5459040698574
gbm_reg
Test Results of model gbm_reg
Insample:17035.790857737702, Test_MAE :17363.30906121161, RMSE:23538.787489468144
xgb_reg
Test Results of model xgb_reg
Insample:17003.957644192742, Test_MAE :17360.185416362634, RMSE:23546.678274110127
lgb_reg
Test Results of model lgb_reg
Insample:8827.653202858586, Test_MAE :8802.402242487045, RMSE:11262.491819070334
def plot_importance(feature_importance_graph,n):
    """Display and plot the top-n features by importance as a horizontal bar chart.

    Parameters:
        feature_importance_graph: DataFrame with a 'feature_importance' column,
            one row per column of the global X_train. NOTE(review): the
            function adds/overwrites a 'Feature' column on this frame in
            place, so the caller's DataFrame is mutated.
        n: number of top features to show.
    """
    feature_importance_graph['Feature']= X_train.columns.values
    top_n = feature_importance_graph.sort_values('feature_importance',ascending= False)[:n]

    # Seed so the synthetic error bars below are reproducible across runs.
    np.random.seed(123)

    plt.rcdefaults()
    # BUG FIX: the original called plt.figure(1, [20, 20]) and then
    # plt.subplots(), which created a *second* default-size figure and left
    # the 20x20 one empty (the stray "<Figure size 2000x2000 with 0 Axes>"
    # in the output). Pass figsize straight to subplots instead.
    fig, ax = plt.subplots(figsize=(20, 20))
    plt.tick_params(axis='both', which='major', labelsize=6)
    features = top_n.index.values
    y_pos = np.arange(len(features))
    importance = top_n['feature_importance']
    # NOTE(review): these xerr bars are uniform random noise, not a real
    # uncertainty estimate — kept for parity with the original plot, but
    # consider dropping them.
    error = np.random.rand(len(features))
    ax.barh(y_pos, importance, xerr=error, align='center',
            color='orange', ecolor='black')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(features)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('Feature Importance')
    ax.set_title('Variable wise Feature Importance')

    display(top_n.T)
    plt.show()
    
# Wrap the fitted decision tree's importances in a one-column frame,
# attach the matching feature names, and plot the ten most important.
feature_importance_graph = pd.DataFrame(
    {'feature_importance': decision_tree_reg.feature_importances_}
)
feature_importance_graph['features'] = X_train.columns.values
plot_importance(feature_importance_graph, 10)

2 4 3 0 7 1 5 29 6 11
feature_importance 0.671346 0.119714 0.062914 0.0274338 0.0262237 0.0136479 0.0125495 0.0121658 0.0105062 0.00513579
features Avg Withdrawls Per Hour Avg No of Withdrawls Per Week Estimated Number of Houses in 1 KM Radius Average Wait Time No of Other ATMs in 1 KM radius ATM RATING ATM Since ATM Zone_RM Number of Shops Around ATM ATM looks_Normal
Feature Avg Withdrawls Per Hour Avg No of Withdrawls Per Week Estimated Number of Houses in 1 KM Radius Average Wait Time No of Other ATMs in 1 KM radius ATM RATING ATM Since ATM Zone_RM Number of Shops Around ATM ATM looks_Normal
<Figure size 2000x2000 with 0 Axes>

Model Optimization

# Brute-force search over tree depths 1..49: for each depth, report the
# 10-fold CV MAE on the training split next to hold-out MAE/RMSE.
# (The loop variable is deliberately still named `max_depth`; later cells
# reference it.)
for max_depth in range(1, 50):
    tree = DecisionTreeRegressor(max_depth=max_depth)
    tree.fit(X_train.values, y_train.values)
    cv_results = cross_val_score(tree, X_train.values, y_train.values, cv=10, scoring=evaluate_mae)
    preds_y = tree.predict(X_test.values)
    print(f'MaxDepth: {max_depth}, Insample:{cv_results.mean()}, Test_MAE :{mean_absolute_error(preds_y,y_test)}, RMSE:{np.sqrt(mean_squared_error(preds_y,y_test))}')
MaxDepth: 1, Insample:43294.407193843464, Test_MAE :43350.97648974466, RMSE:58753.779358189815
MaxDepth: 2, Insample:34470.48093023932, Test_MAE :34452.15347041783, RMSE:48040.82307556384
MaxDepth: 3, Insample:30730.858600761956, Test_MAE :30785.79328678622, RMSE:43021.15033911037
MaxDepth: 4, Insample:27075.845357110287, Test_MAE :27101.827067065526, RMSE:37404.56304742286
MaxDepth: 5, Insample:24192.29538661906, Test_MAE :24315.710304590277, RMSE:32815.98081585613
MaxDepth: 6, Insample:21489.493379952222, Test_MAE :21671.120765700663, RMSE:29411.762232802153
MaxDepth: 7, Insample:18823.224759668836, Test_MAE :19020.248075196643, RMSE:26492.212086323743
MaxDepth: 8, Insample:16426.381671604126, Test_MAE :16537.507216759834, RMSE:23428.96837179775
MaxDepth: 9, Insample:14015.643262894799, Test_MAE :14090.42279438137, RMSE:20653.748714649097
MaxDepth: 10, Insample:11785.719542173143, Test_MAE :11899.980043085003, RMSE:18086.06736185007
MaxDepth: 11, Insample:9890.587989177347, Test_MAE :10071.586714158171, RMSE:15756.99832871634
MaxDepth: 12, Insample:8159.441827501598, Test_MAE :8329.027560570526, RMSE:12832.546645145418
MaxDepth: 13, Insample:6620.658706836798, Test_MAE :6809.224021242581, RMSE:10746.086380315965
MaxDepth: 14, Insample:5226.356138551383, Test_MAE :5252.251528108063, RMSE:8265.594697467895
MaxDepth: 15, Insample:4250.857669024639, Test_MAE :4236.616719384278, RMSE:6544.435175853031
MaxDepth: 16, Insample:3579.1025968385607, Test_MAE :3526.1082626291777, RMSE:5155.150944273315
MaxDepth: 17, Insample:3101.084115247994, Test_MAE :3058.2062124503377, RMSE:4180.571224590666
MaxDepth: 18, Insample:2813.4490703349584, Test_MAE :2779.639585751901, RMSE:3572.02728052724
MaxDepth: 19, Insample:2652.6172704562205, Test_MAE :2620.4293123885145, RMSE:3229.1686026058674
MaxDepth: 20, Insample:2552.481481627842, Test_MAE :2530.5122563299956, RMSE:3030.0612847501725
MaxDepth: 21, Insample:2512.0205351531895, Test_MAE :2499.657672121048, RMSE:2965.3957891493815
MaxDepth: 22, Insample:2493.375410655453, Test_MAE :2476.1411975987353, RMSE:2917.68309710487
MaxDepth: 23, Insample:2491.4955577328146, Test_MAE :2474.5194750922105, RMSE:2915.085179714678
MaxDepth: 24, Insample:2491.938328256002, Test_MAE :2475.610214083156, RMSE:2917.130001368455
MaxDepth: 25, Insample:2492.8962640399473, Test_MAE :2476.6713625868847, RMSE:2918.686244501055
MaxDepth: 26, Insample:2493.3233993681397, Test_MAE :2477.9480618760854, RMSE:2920.3615772274343
MaxDepth: 27, Insample:2493.8422611012165, Test_MAE :2478.4225517057953, RMSE:2921.2257956363974
MaxDepth: 28, Insample:2494.0505980820217, Test_MAE :2478.1742227314726, RMSE:2921.1703463824874
MaxDepth: 29, Insample:2494.1326428318694, Test_MAE :2477.925285229903, RMSE:2921.0670500577844
MaxDepth: 30, Insample:2494.1560760959433, Test_MAE :2477.9648782987874, RMSE:2921.119651120108
MaxDepth: 31, Insample:2494.2048587315307, Test_MAE :2477.978774094098, RMSE:2921.1370952462084
MaxDepth: 32, Insample:2494.2247491848498, Test_MAE :2477.992857881511, RMSE:2921.1487430926636
MaxDepth: 33, Insample:2494.2267283732717, Test_MAE :2477.9967758321136, RMSE:2921.1513564449738
MaxDepth: 34, Insample:2494.2304658885837, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 35, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 36, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 37, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 38, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 39, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 40, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 41, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 42, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 43, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 44, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 45, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 46, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 47, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 48, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 49, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
# Refit at the best depth from the sweep above (depth 23 had the lowest
# in-sample CV MAE, ~2491, and test MAE ~2474).
best_depth = 23
model = DecisionTreeRegressor(max_depth=best_depth)
model.fit(X_train.values, y_train.values)
cv_results = cross_val_score(model, X_train.values, y_train.values, cv=10, scoring=evaluate_mae)
preds_y = model.predict(X_test.values)
# BUG FIX: the original f-string printed the stale loop variable `max_depth`
# (49, left over from the tuning loop) instead of the depth actually used.
print(f'MaxDepth: {best_depth}, Insample:{cv_results.mean()}, Test_MAE :{mean_absolute_error(preds_y,y_test)}, RMSE:{np.sqrt(mean_squared_error(preds_y,y_test))}')
MaxDepth: 49, Insample:2491.4955577328146, Test_MAE :2474.5194750922105, RMSE:2915.085179714678
# Bucket predictions and actuals into 10 equal-width bins each, then
# cross-tabulate the bin codes to see how often a prediction lands in the
# same decile as the true value (mass on the diagonal = good calibration).
result_df = pd.DataFrame({'preds': preds_y, 'actuals': y_test.values})
result_df['pred_bin'] = pd.cut(result_df['preds'], 10).cat.codes
result_df['actuals_bin'] = pd.cut(result_df['actuals'], 10).cat.codes
pd.crosstab(result_df['actuals_bin'], result_df['pred_bin'])
pred_bin 0 1 2 3 4 5 6 7 8 9
actuals_bin









0 5455 75 0 0 0 0 0 0 0 0
1 518 29381 222 0 0 0 0 0 0 0
2 0 1117 15352 58 0 0 0 0 0 0
3 0 0 314 5636 51 0 0 0 0 0
4 0 0 0 98 2075 33 0 0 0 0
5 0 0 0 0 64 731 0 0 0 0
6 0 0 0 0 0 24 164 0 0 0
7 0 0 0 0 0 0 11 146 4 0
8 0 0 0 0 0 0 0 11 49 0
9 0 0 0 0 0 0 0 0 0 82

MODEL Approach 1 BEST MAE : 2474.519, RMSE : 2915.085

def outlier_data_treatment(df, outlier_headers):
    """Cap outliers in the given columns using Tukey's 1.5*IQR fences.

    For each column, values above q75 + 1.5*IQR are replaced by that upper
    fence and values below q25 - 1.5*IQR by the lower fence. NaNs are ignored
    when computing the fences and are left untouched. Mutates `df` in place
    and also returns it.

    Parameters:
        df: DataFrame to treat in place.
        outlier_headers: iterable of column labels (a DataFrame also works,
            since iterating one yields its column labels).
    """
    for col in outlier_headers:
        print (col)
        (q75, q25) = np.percentile(df[col].dropna(), [75, 25])
        iqr = q75 - q25
        upper_val = (q75 + iqr * 1.5).astype(df[col].dtypes)
        print (upper_val)
        # BUG FIX: the original cast only `iqr * 1.5` (not the whole lower
        # fence) to the column dtype, so for integer columns the two fences
        # were truncated inconsistently. Cast both fences the same way.
        lower_val = (q25 - iqr * 1.5).astype(df[col].dtypes)
        print (lower_val)
        # Vectorized clip replaces the per-element apply(replace_outlier,...);
        # clip leaves NaNs unchanged, matching the old element-wise behaviour.
        df[col] = df[col].clip(lower=lower_val, upper=upper_val)
    return df

def replace_outlier(i, uv, lv):
    """Clamp a single value to the closed interval [lv, uv].

    Values below the lower fence `lv` become `lv`; values above the upper
    fence `uv` become `uv`; everything else passes through unchanged.
    """
    if i < lv:
        return lv
    if i > uv:
        return uv
    return i
# Cap outliers in every numeric column, then rebuild the modelling frame:
# one-hot encode the categorical columns and join them onto the continuous
# ones. NOTE: outlier_data_treatment mutates `dataset` in place, so
# `outlier_treated_data` and `dataset` are the same object afterwards.
outlier_treated_data = outlier_data_treatment(dataset,dataset.select_dtypes(np.number))

# `exploration_dict` / `continuos_columns` come from an earlier notebook part
# — presumably lists of categorical / continuous column names; verify there.
dataset_cat = pd.get_dummies(outlier_treated_data[exploration_dict['possible_categorical_features']])
dataset_continuos = outlier_treated_data[continuos_columns]
dataset_fin = dataset_continuos.join(dataset_cat)

# Rebuild the train/test split on the outlier-treated features and refit the
# depth-23 tree to measure the effect of outlier capping on MAE/RMSE.
X = dataset_fin.copy()
y = dataset[target[0]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

best_depth = 23
model = DecisionTreeRegressor(max_depth=best_depth)
model.fit(X_train.values, y_train.values)
cv_results = cross_val_score(model, X_train.values, y_train.values, cv=10, scoring=evaluate_mae)
preds_y = model.predict(X_test.values)
# BUG FIX: the original printed the stale `max_depth` (49) left over from
# the tuning loop instead of the depth actually fitted here (23).
print(f'MaxDepth: {best_depth}, Insample:{cv_results.mean()}, Test_MAE :{mean_absolute_error(preds_y,y_test)}, RMSE:{np.sqrt(mean_squared_error(preds_y,y_test))}')


# Decile-bin predictions vs actuals and show how well they line up.
result_df = pd.DataFrame(preds_y, columns=['preds'])
result_df['actuals'] = y_test.values
result_df['pred_bin'] = pd.cut(result_df['preds'], 10).cat.codes
result_df['actuals_bin'] = pd.cut(result_df['actuals'], 10).cat.codes
display(pd.crosstab(result_df['actuals_bin'], result_df['pred_bin']))
Number of Shops Around ATM
149
-46.0
No of Other ATMs in 1 KM radius
111.5
27.5
Estimated Number of Houses in 1 KM Radius
17673.5
1481.5
Avg Withdrawls Per Hour
10.0
2.0
Avg No of Withdrawls Per Week
2155
119.0
Average Wait Time
4.5
0.5
ATM RATING
5
2.0
ATM Since
2012
2004.0
AmountWithDrawn
344450
9650.0
MaxDepth: 49, Insample:2377.345644638866, Test_MAE :2362.4068984181645, RMSE:2841.7989787225233
pred_bin 0 1 2 3 4 5 6 7 8 9
actuals_bin









0 627 2 0 0 0 0 0 0 0 0
1 114 2971 58 0 0 0 0 0 0 0
2 0 474 7697 163 0 0 0 0 0 0
3 0 0 1038 14027 154 0 0 0 0 0
4 0 0 0 999 10625 178 0 0 0 0
5 0 0 0 0 660 6757 90 0 0 0
6 0 0 0 0 0 448 4719 103 0 0
7 0 0 0 0 0 0 154 3223 83 0
8 0 0 0 0 0 0 0 203 1572 61
9 0 0 0 0 0 0 0 0 38 4433
sns.boxplot(result_df.actuals)
<matplotlib.axes._subplots.AxesSubplot at 0x7f6ca62d9080>
result_df[['actuals','preds']].hist()
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f6cad972630>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f6cad99a7f0>]],
      dtype=object)

Comments

Popular posts from this blog

Amount Withdrawn Model Part 2

Hybrid Recommendation Engine

Amount Withdrawn Model Part 1