Amount Withdrawn Model Part 4

Task 2: Building regression models to predict "Amount Withdrawn"

# Scorer wrapper so cross_val_score reports MAE directly.
# NOTE(review): make_scorer defaults to greater_is_better=True, so sklearn
# would treat a larger MAE as "better" if this scorer were used for model
# selection; here the CV scores are only printed, so it is harmless.
evaluate_mae = make_scorer(mean_absolute_error)
# selected_columns_top50 = dataset[selected_columns].corr()[target].dropna().sort_values(ascending = False)[:50].keys().values

# Feature matrix: copy of the preprocessed frame built earlier in the notebook.
# Target: the first entry of the global `target` list (AmountWithDrawn).
X = dataset_fin.copy()
y= dataset[target[0]]

# Hold out a third of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Candidate regressors, all left at their default hyperparameters.
linear_reg = LinearRegression()
decision_tree_reg = DecisionTreeRegressor()
rf_reg = RandomForestRegressor()
gbm_reg = GradientBoostingRegressor()
xgb_reg = XGBRegressor()
lgb_reg = LGBMRegressor()

Approach 1

# Approach 1: fit each candidate regressor, score it with 10-fold CV on the
# training set, and report hold-out MAE/RMSE on the test split.
# FIX: the original stored model names in a comma-joined string and recovered
# each estimator with eval(), which is fragile and unsafe. An explicit
# name -> estimator mapping gives the same iteration order and output.
models = {
    'linear_reg': linear_reg,
    'decision_tree_reg': decision_tree_reg,
    'rf_reg': rf_reg,
    'gbm_reg': gbm_reg,
    'xgb_reg': xgb_reg,
    'lgb_reg': lgb_reg,
}
for m, model in models.items():
    print(m)

    model.fit(X_train.values, y_train.values)
    cv_results = cross_val_score(model, X_train.values, y_train.values, cv=10, scoring=evaluate_mae)
    preds_y = model.predict(X_test.values)
    print(f'Test Results of model {m}')
    print(f'Insample:{cv_results.mean()}, Test_MAE :{mean_absolute_error(preds_y,y_test)}, RMSE:{np.sqrt(mean_squared_error(preds_y,y_test))}')
linear_reg
Test Results of model linear_reg
Insample:25845.110961501963, Test_MAE :25984.326239575894, RMSE:38645.92650508666
decision_tree_reg
Test Results of model decision_tree_reg
Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
rf_reg
Test Results of model rf_reg
Insample:2502.3302651438826, Test_MAE :2483.5484329535207, RMSE:2931.5459040698574
gbm_reg
Test Results of model gbm_reg
Insample:17035.790857737702, Test_MAE :17363.30906121161, RMSE:23538.787489468144
xgb_reg
Test Results of model xgb_reg
Insample:17003.957644192742, Test_MAE :17360.185416362634, RMSE:23546.678274110127
lgb_reg
Test Results of model lgb_reg
Insample:8827.653202858586, Test_MAE :8802.402242487045, RMSE:11262.491819070334
def plot_importance(feature_importance_graph,n):
    """Display and plot the top-n features by importance as a horizontal bar chart.

    Parameters:
        feature_importance_graph: DataFrame with a 'feature_importance' column,
            one row per column of the global X_train. NOTE(review): the
            function adds/overwrites a 'Feature' column on this frame in
            place, so the caller's DataFrame is mutated.
        n: number of top features to show.
    """
    feature_importance_graph['Feature']= X_train.columns.values
    top_n = feature_importance_graph.sort_values('feature_importance',ascending= False)[:n]

    # Seed so the synthetic error bars below are reproducible across runs.
    np.random.seed(123)

    plt.rcdefaults()
    # BUG FIX: the original called plt.figure(1, [20, 20]) and then
    # plt.subplots(), which created a *second* default-size figure and left
    # the 20x20 one empty (the stray "<Figure size 2000x2000 with 0 Axes>"
    # in the output). Pass figsize straight to subplots instead.
    fig, ax = plt.subplots(figsize=(20, 20))
    plt.tick_params(axis='both', which='major', labelsize=6)
    features = top_n.index.values
    y_pos = np.arange(len(features))
    importance = top_n['feature_importance']
    # NOTE(review): these xerr bars are uniform random noise, not a real
    # uncertainty estimate — kept for parity with the original plot, but
    # consider dropping them.
    error = np.random.rand(len(features))
    ax.barh(y_pos, importance, xerr=error, align='center',
            color='orange', ecolor='black')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(features)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('Feature Importance')
    ax.set_title('Variable wise Feature Importance')

    display(top_n.T)
    plt.show()
    
# Wrap the fitted decision tree's importances in a one-column frame,
# attach the matching feature names, and plot the ten most important.
feature_importance_graph = pd.DataFrame(
    {'feature_importance': decision_tree_reg.feature_importances_}
)
feature_importance_graph['features'] = X_train.columns.values
plot_importance(feature_importance_graph, 10)

2 4 3 0 7 1 5 29 6 11
feature_importance 0.671346 0.119714 0.062914 0.0274338 0.0262237 0.0136479 0.0125495 0.0121658 0.0105062 0.00513579
features Avg Withdrawls Per Hour Avg No of Withdrawls Per Week Estimated Number of Houses in 1 KM Radius Average Wait Time No of Other ATMs in 1 KM radius ATM RATING ATM Since ATM Zone_RM Number of Shops Around ATM ATM looks_Normal
Feature Avg Withdrawls Per Hour Avg No of Withdrawls Per Week Estimated Number of Houses in 1 KM Radius Average Wait Time No of Other ATMs in 1 KM radius ATM RATING ATM Since ATM Zone_RM Number of Shops Around ATM ATM looks_Normal
<Figure size 2000x2000 with 0 Axes>

Model Optimization

# Brute-force search over tree depths 1..49: for each depth, report the
# 10-fold CV MAE on the training split next to hold-out MAE/RMSE.
# (The loop variable is deliberately still named `max_depth`; later cells
# reference it.)
for max_depth in range(1, 50):
    tree = DecisionTreeRegressor(max_depth=max_depth)
    tree.fit(X_train.values, y_train.values)
    cv_results = cross_val_score(tree, X_train.values, y_train.values, cv=10, scoring=evaluate_mae)
    preds_y = tree.predict(X_test.values)
    print(f'MaxDepth: {max_depth}, Insample:{cv_results.mean()}, Test_MAE :{mean_absolute_error(preds_y,y_test)}, RMSE:{np.sqrt(mean_squared_error(preds_y,y_test))}')
MaxDepth: 1, Insample:43294.407193843464, Test_MAE :43350.97648974466, RMSE:58753.779358189815
MaxDepth: 2, Insample:34470.48093023932, Test_MAE :34452.15347041783, RMSE:48040.82307556384
MaxDepth: 3, Insample:30730.858600761956, Test_MAE :30785.79328678622, RMSE:43021.15033911037
MaxDepth: 4, Insample:27075.845357110287, Test_MAE :27101.827067065526, RMSE:37404.56304742286
MaxDepth: 5, Insample:24192.29538661906, Test_MAE :24315.710304590277, RMSE:32815.98081585613
MaxDepth: 6, Insample:21489.493379952222, Test_MAE :21671.120765700663, RMSE:29411.762232802153
MaxDepth: 7, Insample:18823.224759668836, Test_MAE :19020.248075196643, RMSE:26492.212086323743
MaxDepth: 8, Insample:16426.381671604126, Test_MAE :16537.507216759834, RMSE:23428.96837179775
MaxDepth: 9, Insample:14015.643262894799, Test_MAE :14090.42279438137, RMSE:20653.748714649097
MaxDepth: 10, Insample:11785.719542173143, Test_MAE :11899.980043085003, RMSE:18086.06736185007
MaxDepth: 11, Insample:9890.587989177347, Test_MAE :10071.586714158171, RMSE:15756.99832871634
MaxDepth: 12, Insample:8159.441827501598, Test_MAE :8329.027560570526, RMSE:12832.546645145418
MaxDepth: 13, Insample:6620.658706836798, Test_MAE :6809.224021242581, RMSE:10746.086380315965
MaxDepth: 14, Insample:5226.356138551383, Test_MAE :5252.251528108063, RMSE:8265.594697467895
MaxDepth: 15, Insample:4250.857669024639, Test_MAE :4236.616719384278, RMSE:6544.435175853031
MaxDepth: 16, Insample:3579.1025968385607, Test_MAE :3526.1082626291777, RMSE:5155.150944273315
MaxDepth: 17, Insample:3101.084115247994, Test_MAE :3058.2062124503377, RMSE:4180.571224590666
MaxDepth: 18, Insample:2813.4490703349584, Test_MAE :2779.639585751901, RMSE:3572.02728052724
MaxDepth: 19, Insample:2652.6172704562205, Test_MAE :2620.4293123885145, RMSE:3229.1686026058674
MaxDepth: 20, Insample:2552.481481627842, Test_MAE :2530.5122563299956, RMSE:3030.0612847501725
MaxDepth: 21, Insample:2512.0205351531895, Test_MAE :2499.657672121048, RMSE:2965.3957891493815
MaxDepth: 22, Insample:2493.375410655453, Test_MAE :2476.1411975987353, RMSE:2917.68309710487
MaxDepth: 23, Insample:2491.4955577328146, Test_MAE :2474.5194750922105, RMSE:2915.085179714678
MaxDepth: 24, Insample:2491.938328256002, Test_MAE :2475.610214083156, RMSE:2917.130001368455
MaxDepth: 25, Insample:2492.8962640399473, Test_MAE :2476.6713625868847, RMSE:2918.686244501055
MaxDepth: 26, Insample:2493.3233993681397, Test_MAE :2477.9480618760854, RMSE:2920.3615772274343
MaxDepth: 27, Insample:2493.8422611012165, Test_MAE :2478.4225517057953, RMSE:2921.2257956363974
MaxDepth: 28, Insample:2494.0505980820217, Test_MAE :2478.1742227314726, RMSE:2921.1703463824874
MaxDepth: 29, Insample:2494.1326428318694, Test_MAE :2477.925285229903, RMSE:2921.0670500577844
MaxDepth: 30, Insample:2494.1560760959433, Test_MAE :2477.9648782987874, RMSE:2921.119651120108
MaxDepth: 31, Insample:2494.2048587315307, Test_MAE :2477.978774094098, RMSE:2921.1370952462084
MaxDepth: 32, Insample:2494.2247491848498, Test_MAE :2477.992857881511, RMSE:2921.1487430926636
MaxDepth: 33, Insample:2494.2267283732717, Test_MAE :2477.9967758321136, RMSE:2921.1513564449738
MaxDepth: 34, Insample:2494.2304658885837, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 35, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 36, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 37, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 38, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 39, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 40, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 41, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 42, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 43, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 44, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 45, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 46, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 47, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 48, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
MaxDepth: 49, Insample:2494.2298785929625, Test_MAE :2478.0075121784052, RMSE:2921.1636684361065
# Refit at the best depth from the sweep above (depth 23 had the lowest
# in-sample CV MAE, ~2491, and test MAE ~2474).
best_depth = 23
model = DecisionTreeRegressor(max_depth=best_depth)
model.fit(X_train.values, y_train.values)
cv_results = cross_val_score(model, X_train.values, y_train.values, cv=10, scoring=evaluate_mae)
preds_y = model.predict(X_test.values)
# BUG FIX: the original f-string printed the stale loop variable `max_depth`
# (49, left over from the tuning loop) instead of the depth actually used.
print(f'MaxDepth: {best_depth}, Insample:{cv_results.mean()}, Test_MAE :{mean_absolute_error(preds_y,y_test)}, RMSE:{np.sqrt(mean_squared_error(preds_y,y_test))}')
MaxDepth: 49, Insample:2491.4955577328146, Test_MAE :2474.5194750922105, RMSE:2915.085179714678
# Bucket predictions and actuals into 10 equal-width bins each, then
# cross-tabulate the bin codes to see how often a prediction lands in the
# same decile as the true value (mass on the diagonal = good calibration).
result_df = pd.DataFrame({'preds': preds_y, 'actuals': y_test.values})
result_df['pred_bin'] = pd.cut(result_df['preds'], 10).cat.codes
result_df['actuals_bin'] = pd.cut(result_df['actuals'], 10).cat.codes
pd.crosstab(result_df['actuals_bin'], result_df['pred_bin'])
pred_bin 0 1 2 3 4 5 6 7 8 9
actuals_bin









0 5455 75 0 0 0 0 0 0 0 0
1 518 29381 222 0 0 0 0 0 0 0
2 0 1117 15352 58 0 0 0 0 0 0
3 0 0 314 5636 51 0 0 0 0 0
4 0 0 0 98 2075 33 0 0 0 0
5 0 0 0 0 64 731 0 0 0 0
6 0 0 0 0 0 24 164 0 0 0
7 0 0 0 0 0 0 11 146 4 0
8 0 0 0 0 0 0 0 11 49 0
9 0 0 0 0 0 0 0 0 0 82

MODEL Approach 1 BEST MAE : 2474.519, RMSE : 2915.085

def outlier_data_treatment(df, outlier_headers):
    """Cap outliers in the given columns using Tukey's 1.5*IQR fences.

    For each column, values above q75 + 1.5*IQR are replaced by that upper
    fence and values below q25 - 1.5*IQR by the lower fence. NaNs are ignored
    when computing the fences and are left untouched. Mutates `df` in place
    and also returns it.

    Parameters:
        df: DataFrame to treat in place.
        outlier_headers: iterable of column labels (a DataFrame also works,
            since iterating one yields its column labels).
    """
    for col in outlier_headers:
        print (col)
        (q75, q25) = np.percentile(df[col].dropna(), [75, 25])
        iqr = q75 - q25
        upper_val = (q75 + iqr * 1.5).astype(df[col].dtypes)
        print (upper_val)
        # BUG FIX: the original cast only `iqr * 1.5` (not the whole lower
        # fence) to the column dtype, so for integer columns the two fences
        # were truncated inconsistently. Cast both fences the same way.
        lower_val = (q25 - iqr * 1.5).astype(df[col].dtypes)
        print (lower_val)
        # Vectorized clip replaces the per-element apply(replace_outlier,...);
        # clip leaves NaNs unchanged, matching the old element-wise behaviour.
        df[col] = df[col].clip(lower=lower_val, upper=upper_val)
    return df

def replace_outlier(i, uv, lv):
    """Clamp a single value to the closed interval [lv, uv].

    Values below the lower fence `lv` become `lv`; values above the upper
    fence `uv` become `uv`; everything else passes through unchanged.
    """
    if i < lv:
        return lv
    if i > uv:
        return uv
    return i
# Cap outliers in every numeric column, then rebuild the modelling frame:
# one-hot encode the categorical columns and join them onto the continuous
# ones. NOTE: outlier_data_treatment mutates `dataset` in place, so
# `outlier_treated_data` and `dataset` are the same object afterwards.
outlier_treated_data = outlier_data_treatment(dataset,dataset.select_dtypes(np.number))

# `exploration_dict` / `continuos_columns` come from an earlier notebook part
# — presumably lists of categorical / continuous column names; verify there.
dataset_cat = pd.get_dummies(outlier_treated_data[exploration_dict['possible_categorical_features']])
dataset_continuos = outlier_treated_data[continuos_columns]
dataset_fin = dataset_continuos.join(dataset_cat)

# Rebuild the train/test split on the outlier-treated features and refit the
# depth-23 tree to measure the effect of outlier capping on MAE/RMSE.
X = dataset_fin.copy()
y = dataset[target[0]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

best_depth = 23
model = DecisionTreeRegressor(max_depth=best_depth)
model.fit(X_train.values, y_train.values)
cv_results = cross_val_score(model, X_train.values, y_train.values, cv=10, scoring=evaluate_mae)
preds_y = model.predict(X_test.values)
# BUG FIX: the original printed the stale `max_depth` (49) left over from
# the tuning loop instead of the depth actually fitted here (23).
print(f'MaxDepth: {best_depth}, Insample:{cv_results.mean()}, Test_MAE :{mean_absolute_error(preds_y,y_test)}, RMSE:{np.sqrt(mean_squared_error(preds_y,y_test))}')


# Decile-bin predictions vs actuals and show how well they line up.
result_df = pd.DataFrame(preds_y, columns=['preds'])
result_df['actuals'] = y_test.values
result_df['pred_bin'] = pd.cut(result_df['preds'], 10).cat.codes
result_df['actuals_bin'] = pd.cut(result_df['actuals'], 10).cat.codes
display(pd.crosstab(result_df['actuals_bin'], result_df['pred_bin']))
Number of Shops Around ATM
149
-46.0
No of Other ATMs in 1 KM radius
111.5
27.5
Estimated Number of Houses in 1 KM Radius
17673.5
1481.5
Avg Withdrawls Per Hour
10.0
2.0
Avg No of Withdrawls Per Week
2155
119.0
Average Wait Time
4.5
0.5
ATM RATING
5
2.0
ATM Since
2012
2004.0
AmountWithDrawn
344450
9650.0
MaxDepth: 49, Insample:2377.345644638866, Test_MAE :2362.4068984181645, RMSE:2841.7989787225233
pred_bin 0 1 2 3 4 5 6 7 8 9
actuals_bin









0 627 2 0 0 0 0 0 0 0 0
1 114 2971 58 0 0 0 0 0 0 0
2 0 474 7697 163 0 0 0 0 0 0
3 0 0 1038 14027 154 0 0 0 0 0
4 0 0 0 999 10625 178 0 0 0 0
5 0 0 0 0 660 6757 90 0 0 0
6 0 0 0 0 0 448 4719 103 0 0
7 0 0 0 0 0 0 154 3223 83 0
8 0 0 0 0 0 0 0 203 1572 61
9 0 0 0 0 0 0 0 0 38 4433
sns.boxplot(result_df.actuals)
<matplotlib.axes._subplots.AxesSubplot at 0x7f6ca62d9080>
result_df[['actuals','preds']].hist()
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f6cad972630>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f6cad99a7f0>]],
      dtype=object)

Comments

Popular posts from this blog

Amount Withdrawn Model Part 2

Hybrid Recommendation Engine

Amount Withdrawn Model Part 1