In [249]:

# Import dependencies
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import PowerTransformer, StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn import svm
import plotly.figure_factory as ff
import matplotlib.pyplot as plt

# Load the cleaned dataset that contains both network types.
df = pd.read_parquet('cleaned_dataset.parquet')

# Build one frame per network type. Each subset is re-indexed from zero and
# loses the 'net_type' column, which is constant after filtering.
is_fixed = df['net_type'] == 'Fixed'
is_mobile = df['net_type'] == 'Mobile'
df_fixed = df[is_fixed].reset_index(drop=True).drop(columns='net_type')
df_mob = df[is_mobile].reset_index(drop=True).drop(columns='net_type')

display(df)
print(df.columns)

# Lookup of every dataset variant used by the notebook: the two raw subsets
# plus the Yeo-Johnson transformed versions read from disk.
dfs = {
    'Fixed': df_fixed,
    'Mobile': df_mob,
    'YeoJ_Fix': pd.read_parquet('yeoj_fixed_dataset.parquet'),
    'YeoJ_Mob': pd.read_parquet('yeoj_mob_dataset.parquet')
}

# Show each variant together with its column index.
for frame in dfs.values():
    display(frame)
    print(frame.columns)

	avg_d_mbps	avg_u_mbps	avg_lat_ms	avg_lat_down_ms	avg_lat_up_ms	net_type
0	50.073	18.199	40	475	1954	Mobile
1	21.784	0.745	47	1493	2252	Mobile
2	18.159	1.662	21	244	2067	Mobile
3	1.439	0.659	749	2357	5083	Mobile
4	13.498	3.525	37	598	1023	Mobile
...	...	...	...	...	...	...
19025	215.644	114.035	14	384	606	Fixed
19026	48.533	17.553	34	172	43	Fixed
19027	5.732	0.473	52	8039	304	Fixed
19028	116.025	129.465	8	91	219	Fixed
19029	145.911	42.130	15	139	555	Fixed

19030 rows × 6 columns

Index(['avg_d_mbps', 'avg_u_mbps', 'avg_lat_ms', 'avg_lat_down_ms',
       'avg_lat_up_ms', 'net_type'],
      dtype='object')
Index(['avg_d_mbps', 'avg_u_mbps', 'avg_lat_ms', 'avg_lat_down_ms',
       'avg_lat_up_ms'],
      dtype='object')
Index(['avg_d_mbps', 'avg_u_mbps', 'avg_lat_ms', 'avg_lat_down_ms',
       'avg_lat_up_ms'],
      dtype='object')
Index(['avg_d_mbps', 'avg_u_mbps', 'avg_lat_ms', 'avg_lat_down_ms',
       'avg_lat_up_ms'],
      dtype='object')
Index(['avg_d_mbps', 'avg_u_mbps', 'avg_lat_ms', 'avg_lat_down_ms',
       'avg_lat_up_ms'],
      dtype='object')

	avg_d_mbps	avg_u_mbps	avg_lat_ms	avg_lat_down_ms	avg_lat_up_ms
0	104.961	104.419	6	126	94
1	212.782	33.322	26	122	223
2	109.832	9.109	18	211	164
3	194.682	116.727	20	279	93
4	151.912	13.325	19	174	454
...	...	...	...	...	...
9809	215.644	114.035	14	384	606
9810	48.533	17.553	34	172	43
9811	5.732	0.473	52	8039	304
9812	116.025	129.465	8	91	219
9813	145.911	42.130	15	139	555

9814 rows × 5 columns

	avg_d_mbps	avg_u_mbps	avg_lat_ms	avg_lat_down_ms	avg_lat_up_ms
0	50.073	18.199	40	475	1954
1	21.784	0.745	47	1493	2252
2	18.159	1.662	21	244	2067
3	1.439	0.659	749	2357	5083
4	13.498	3.525	37	598	1023
...	...	...	...	...	...
9211	42.572	23.439	22	238	640
9212	15.952	0.256	39	1189	1083
9213	107.443	25.328	24	751	1555
9214	26.593	21.297	36	565	378
9215	23.803	4.061	26	284	1020

9216 rows × 5 columns

	avg_d_mbps	avg_u_mbps	avg_lat_ms	avg_lat_down_ms	avg_lat_up_ms
0	8.972164	4.759714	1.500592	5.364215	4.594474
1	11.504014	3.594225	2.152917	5.325130	5.469041
2	9.121034	2.338349	2.006061	5.997121	5.157019
3	11.159330	4.875059	2.049368	6.346418	4.583702
4	10.238558	2.695041	2.028408	5.758639	6.193776
...	...	...	...	...	...
9809	11.556506	4.850887	1.898827	6.751223	6.489121
9810	6.696179	2.960429	2.252471	5.744407	3.812195
9811	2.457252	0.387996	2.397371	10.894972	5.784466
9812	9.303431	4.982478	1.641398	4.972823	5.450639
9813	10.094487	3.830499	1.928811	5.483558	6.399159

9814 rows × 5 columns

	avg_d_mbps	avg_u_mbps	avg_lat_ms	avg_lat_down_ms	avg_lat_up_ms
0	4.239305	3.351656	1.842729	7.078103	13.227949
1	3.317389	0.569910	1.873058	8.614639	13.627984
2	3.123101	1.020246	1.700411	6.220586	13.385472
3	0.906716	0.517075	2.171259	9.249501	16.077464
4	2.813201	1.608974	1.827349	7.380719	11.498883
...	...	...	...	...	...
9211	4.055669	3.663706	1.711899	6.189053	10.337699
9212	2.986643	0.230117	1.837787	8.302855	11.645122
9213	5.124904	3.761257	1.732857	7.683230	12.600224
9214	3.533581	3.544352	1.821830	7.305826	9.119884
9215	3.413049	1.736568	1.751540	6.413657	11.491378

9216 rows × 5 columns

Univariate

In [250]:

# Univariate linear regression on the 'Fixed' subset: predict download speed
# from upload speed, once on the raw predictor and once with a Yeo-Johnson
# power transform applied to the predictor inside a pipeline.
fixed = dfs['Fixed']
# Predictor as a single-column 2-D array
X = fixed['avg_u_mbps'].values.reshape(-1, 1)
# Target as a single-column 2-D array
y = fixed['avg_d_mbps'].values.reshape(-1, 1)

# Hold out 20% of the rows for evaluation
X_train_fix, X_test_fix, y_train_fix, y_test_fix = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline: plain linear regression on the untransformed predictor
model_orig = LinearRegression().fit(X_train_fix, y_train_fix)

# Variant: Yeo-Johnson transform of the predictor followed by linear regression
pipe_trans = Pipeline(steps=[
    ('power_transform', PowerTransformer(method='yeo-johnson')),
    ('model', LinearRegression()),
]).fit(X_train_fix, y_train_fix)

# Test-set predictions for both variants
y_pred_orig = model_orig.predict(X_test_fix)
y_pred_trans = pipe_trans.predict(X_test_fix)

# Score both variants with the same four metrics
predictions = {
    'Original data': y_pred_orig,
    'Transformed data': y_pred_trans,
}
results = {}
for label, y_hat in predictions.items():
    mse = mean_squared_error(y_test_fix, y_hat)
    results[label] = {
        'Mean Abs Error': mean_absolute_error(y_test_fix, y_hat),
        'Mean Sq2 Error': mse,
        'Root Mean Sq2 Error': np.sqrt(mse),
        'R2': r2_score(y_test_fix, y_hat),
    }

# One column per variant, one row per metric
results_df = pd.DataFrame(results)

print(results_df)
# Render the same table as a plotly figure
fig = ff.create_table(results_df, index=True)
fig.show()

                     Original data  Transformed data
Mean Abs Error           88.749450         83.580674
Mean Sq2 Error        12988.151119      12465.236420
Root Mean Sq2 Error     113.965570        111.647823
R2                        0.306146          0.334081

In [251]:

# Multivariate linear regression on the 'Fixed' subset: predict download speed
# from every other column, once on the raw predictors and once with a
# Yeo-Johnson power transform applied to the predictors inside a pipeline.
fixed = dfs['Fixed']
# Predictors: everything except the target column
X = fixed.drop(columns='avg_d_mbps')
# Target as a single-column 2-D array
y = fixed['avg_d_mbps'].values.reshape(-1, 1)

# Hold out 20% of the rows for evaluation
X_train_fix, X_test_fix, y_train_fix, y_test_fix = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline: plain linear regression on the untransformed predictors
model_orig = LinearRegression().fit(X_train_fix, y_train_fix)

# Variant: Yeo-Johnson transform of the predictors followed by linear regression
pipe_trans = Pipeline(steps=[
    ('power_transform', PowerTransformer(method='yeo-johnson')),
    ('model', LinearRegression()),
]).fit(X_train_fix, y_train_fix)

# Test-set predictions for both variants
y_pred_orig = model_orig.predict(X_test_fix)
y_pred_trans = pipe_trans.predict(X_test_fix)

# Score both variants with the same four metrics
predictions = {
    'Original data': y_pred_orig,
    'Transformed data': y_pred_trans,
}
results = {}
for label, y_hat in predictions.items():
    mse = mean_squared_error(y_test_fix, y_hat)
    results[label] = {
        'Mean Abs Error': mean_absolute_error(y_test_fix, y_hat),
        'Mean Sq2 Error': mse,
        'Root Mean Sq2 Error': np.sqrt(mse),
        'R2': r2_score(y_test_fix, y_hat),
    }

# One column per variant, one row per metric
results_df = pd.DataFrame(results)

print(results_df)
# Render the same table as a plotly figure
fig = ff.create_table(results_df, index=True)
fig.show()

                     Original data  Transformed data
Mean Abs Error           84.074094         78.798233
Mean Sq2 Error        11856.275510      10688.059009
Root Mean Sq2 Error     108.886526        103.383069
R2                        0.366613          0.429022

Gradient Boosting Machine

Multivariate

In [252]:

# Multivariate gradient boosting regression on the 'Fixed' subset: predict
# download speed from every other column using the untransformed data.

# Predictor variables: everything except the target column
X = dfs['Fixed'].drop(columns='avg_d_mbps')
# Target variable (1-D array)
y = dfs['Fixed']['avg_d_mbps'].values

# Split the data into training and testing sets
X_train_fix, X_test_fix, y_train_fix, y_test_fix = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a scikit-learn gradient boosting model (this is not XGBoost).
# The seed is fixed so the metrics below are repeatable across kernel restarts;
# it matches the seed used for the train/test split.
model = GradientBoostingRegressor(random_state=42)

# Train the model
model.fit(X_train_fix, y_train_fix)

# Make predictions on the held-out rows
y_pred_orig = model.predict(X_test_fix)

# Evaluate the model
mae_orig = mean_absolute_error(y_test_fix, y_pred_orig)
mse_orig = mean_squared_error(y_test_fix, y_pred_orig)
rmse_orig = np.sqrt(mse_orig)
r2_orig = r2_score(y_test_fix, y_pred_orig)

# Store results in a dict keyed by dataset variant, then by metric name
results = {
    'Original data': {
        'Mean Abs Error': mae_orig,
        'Mean Sq2 Error': mse_orig,
        'Root Mean Sq2 Error': rmse_orig,
        'R2': r2_orig
    }
}

# Convert dict to dataframe (one column per variant, one row per metric)
results_df = pd.DataFrame.from_dict(results)

print(results_df)
# Create a table from the DataFrame and display it
fig = ff.create_table(results_df, index=True)
fig.show()

                     Original data
Mean Abs Error           66.398040
Mean Sq2 Error         8591.947001
R2                        0.541000
Root Mean Sq2 Error      92.692756

Classification

In [253]:

# Shared train/test split used by the classification cells.

# Column to predict, and every remaining column as a feature
target = 'net_type'
features = df.columns.drop(target)

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.2,
    random_state=42,
)

Support Vector Machine Models

In [254]:

Code

# Support Vector Machine classifier fitted on the untransformed features
model = svm.SVC().fit(X_train, y_train)

# Predict the network type of the held-out rows
y_pred = model.predict(X_test)

# Confusion matrix, with rows and columns in the classifier's own class order
class_labels = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()
plt.show()

# Per-class precision, recall and F1
print(classification_report(y_test, y_pred))

Figure 1: SVM original data confusion matrix

              precision    recall  f1-score   support

       Fixed       0.84      0.80      0.82      1968
      Mobile       0.79      0.83      0.81      1838

    accuracy                           0.82      3806
   macro avg       0.82      0.82      0.82      3806
weighted avg       0.82      0.82      0.82      3806

In [255]:

Code

# Support Vector Machine classifier preceded by a Yeo-Johnson power transform

# Two-step pipeline: transform the features, then classify
pipe_trans = Pipeline(steps=[
    ('power_transform', PowerTransformer(method='yeo-johnson')),
    ('model', svm.SVC()),
]).fit(X_train, y_train)

# Predict the network type of the held-out rows
y_pred = pipe_trans.predict(X_test)

# Confusion matrix, with rows and columns in the pipeline's class order
class_labels = pipe_trans.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()
plt.show()

# Per-class precision, recall and F1
print(classification_report(y_test, y_pred))

Figure 2: SVM transformed data confusion matrix

              precision    recall  f1-score   support

       Fixed       0.92      0.83      0.87      1968
      Mobile       0.84      0.92      0.88      1838

    accuracy                           0.87      3806
   macro avg       0.88      0.88      0.87      3806
weighted avg       0.88      0.87      0.87      3806

Random Forest Classifier Models

In [256]:

Code

# Random Forest classifier on the untransformed features

# Single-step pipeline wrapping the forest. No transform is applied; the
# pipeline wrapper gives the estimator the 'model' step name that the
# 'model__*' grid-search parameters refer to.
# The seed is fixed so the confusion matrix and report are repeatable across
# kernel restarts; it matches the seed used for the train/test split.
pipe_trans = Pipeline([
    ('model', RandomForestClassifier(n_estimators=300, random_state=42))
])

# Train the model
pipe_trans.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = pipe_trans.predict(X_test)

# Plot confusion matrix, using the pipeline's class order for rows and columns
cm = confusion_matrix(y_test, y_pred, labels=pipe_trans.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=pipe_trans.classes_)
disp.plot()
plt.show()

# Evaluate the model
print(classification_report(y_test, y_pred))

Figure 3: Random forest confusion matrix

              precision    recall  f1-score   support

       Fixed       0.90      0.85      0.87      1968
      Mobile       0.84      0.90      0.87      1838

    accuracy                           0.87      3806
   macro avg       0.87      0.87      0.87      3806
weighted avg       0.87      0.87      0.87      3806

In [258]:

# Grid search over tree-shape hyperparameters of the Random Forest classifier.

# Build the estimator to tune in this cell instead of reusing `pipe_trans`,
# whose contents depend on which cell last assigned it. The seed is fixed so
# the selected parameters and score are repeatable across kernel restarts.
rf_search_pipe = Pipeline([
    ('model', RandomForestClassifier(n_estimators=300, random_state=42))
])

# Parameter grid; the 'model__' prefix routes each entry to the pipeline's
# 'model' step. 4 * 4 * 3 * 3 = 144 candidate combinations.
param_grid = {
    'model__max_depth': [None, 5, 10, 15],
    'model__max_leaf_nodes': [None, 5, 10, 15],
    'model__min_samples_leaf': [1, 2, 4],
    'model__min_samples_split': [2, 5, 10]
}

# 2-fold cross-validation scored on accuracy, using all available cores
grid_search = GridSearchCV(rf_search_pipe, param_grid, cv=2, scoring='accuracy', verbose=1, n_jobs=-1)

# Run the search on the training split only; the test split stays untouched
grid_search.fit(X_train, y_train)

# Print the best parameter combination and its mean cross-validated accuracy
print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 2 folds for each of 144 candidates, totalling 288 fits
{'model__max_depth': None, 'model__max_leaf_nodes': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5}
0.8707304256437205

In [259]:

Code

# Random Forest classifier refitted with the hyperparameters reported by the
# grid search.

# Single-step pipeline wrapping the forest (no transform is applied).
# The hyperparameters are written out explicitly so this cell can run without
# repeating the search. The seed is fixed so the confusion matrix and report
# are repeatable across kernel restarts.
pipe_trans = Pipeline([
    ('model', RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        max_leaf_nodes=None,
        min_samples_leaf=1,
        min_samples_split=5,
        random_state=42,
    ))
])

# Train the model
pipe_trans.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred = pipe_trans.predict(X_test)

# Plot confusion matrix, using the pipeline's class order for rows and columns
cm = confusion_matrix(y_test, y_pred, labels=pipe_trans.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=pipe_trans.classes_)
disp.plot()
plt.show()

# Evaluate the model
print(classification_report(y_test, y_pred))

Figure 4: Random forest confusion matrix, trained with the grid-search-tuned parameters

              precision    recall  f1-score   support

       Fixed       0.90      0.84      0.87      1968
      Mobile       0.84      0.90      0.87      1838

    accuracy                           0.87      3806
   macro avg       0.87      0.87      0.87      3806
weighted avg       0.88      0.87      0.87      3806

ML_Models.ipynb

Linear Regression

Gradient Boosting Machine

Multivariate

Classification

Support Vector Machine Models

Random Forest Classifier Models