%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings('ignore')
import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)
import ads
import os
from ads.dataset.factory import DatasetFactory
from ads.dataset.dataset_browser import DatasetBrowser
import ads.environment.ml_runtime
from ads.common.model import ADSModel
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.catalog.model import ModelSummaryList, ModelCatalog
from ads.catalog.project import ProjectSummaryList, ProjectCatalog
from ads.catalog.summary import SummaryList
from ads.common.model_artifact import ModelArtifact
ads.set_debug_mode(True)
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
import ads.environment.ml_runtime
import numpy as np
import pandas as pd
import sys
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.font_manager
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from pyod.models.iforest import IForest
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize
from pyod.models.knn import KNN
In order to get the data from object storage we need to follow the steps below:
Go to OCI Console.
a. Click the profile icon at the top right, then click your user name (shown as "oracleidentitycloudservice/<your user name>").
b. Under Resources, go to API Keys.
c. Create a new API key if you don't already have one:
   - Select the "Generate API Key Pair" radio button.
   - Copy the contents of the Configuration File Preview.
   - Download the private key.
   - Click Add.
Come back to the Data Science platform, open terminal.
a. Inside /home/datascience directory create a .oci directory (mkdir .oci)
b. In .oci, create a config file (touch config) and a private_key.pem (touch private_key.pem)
(For the following steps use a text editor for Unix, we used vi - visual editor)
Copy the content of the Configuration File Preview and paste it into this config file inside .oci. Replace "< path to your private keyfile > #TODO" with /home/datascience/.oci/private_key.pem
Use the ADS DatasetFactory in your notebook to load your data. The code is provided below.
# Information such as file_name, name_space and bucket_name will be available in Bucket Details. (Object Storage > Bucket Details)
ads.set_auth(auth='resource_principal')
bucket_name = 'Fraud_data'
file_name = 'fraud_creditcard.csv'
name_space = 'idhkis4m3p5e'
# creditcard.csv is loaded from object storage.
# NOTE(review): the object-storage URI scheme is "oci://" and must include the
# namespace: "oci://<bucket>@<namespace>/<object>".  The original "ocis://"
# URI had no namespace.  The explicit storage_options below (API-key config
# file) take precedence for this call over the resource-principal auth set
# above; a dead storage_options dict built from env vars was removed because
# it was never passed anywhere.
ds = DatasetFactory.open(f"oci://{bucket_name}@{name_space}/{file_name}",
                         storage_options={"config": "~/.oci/config",
                                          "profile": "DEFAULT"},
                         target="Class",
                         type_discovery=False,
                         types={'Class': 'category'})
Initializing:0.000s Opening data:16.718s Generating data sample:6.195s Building dataset:0.486s
# Checking for null values using a seaborn heatmap on the ADS dataset:
# missing entries would appear as contrasting cells; a uniform plot means no nulls.
sns.heatmap(ds.isnull(), cbar=False)
<AxesSubplot:>
# Quick overview of all the column types and how each column's values are distributed.
ds.show_in_notebook()
# Visualize the distribution of the Time feature.
plt.figure(figsize=(10,8))
plt.title('Distribution of Time Feature')
# sns.distplot is deprecated (removed in recent seaborn releases);
# histplot(..., kde=True, stat='density') is the documented replacement
# and produces the same density histogram + KDE overlay.
sns.histplot(ds.Time, kde=True, stat='density')
<AxesSubplot:title={'center':'Distribution of Time Feature'}, xlabel='Time', ylabel='Density'>
# To explore features, use the smart plot() method. It accepts one or two feature names.
# show_in_notebook() automatically determines the best type of plot for those features.
ds.plot("Class").show_in_notebook(figsize=(4,4))
# Class counts; .compute() forces evaluation of the lazily-evaluated (dask-backed) column.
ds['Class'].value_counts().compute()
0 284315 1 492 Name: Class, dtype: int64
ds.info()
<class 'dask.dataframe.core.DataFrame'> Columns: 31 entries, Time to Class dtypes: float64(31)
# There are functions to convert an ADS dataset to a Pandas DataFrame and vice versa.
data = ds.to_pandas_dataframe()
from imblearn.combine import SMOTETomek
# Create independent (X) and dependent (Y) features.
# Store the name of the variable we are predicting.
target = "Class"
# All columns except the target are predictors.
columns = [c for c in data.columns if c != target]
# NOTE(review): the original also built an unused np.random.RandomState(42);
# it was never referenced (SMOTETomek below receives random_state=42 directly),
# so it has been removed.
X = data[columns]
Y = data[target]
# Print the shapes of X & Y
print(X.shape)
print(Y.shape)
(284807, 30) (284807,)
# Implementing oversampling for handling the class imbalance using SMOTE-Tomek links.
smk = SMOTETomek(random_state=42)
# fit_sample was deprecated and later removed from imbalanced-learn;
# fit_resample is the supported API with identical behavior.
X_res, y_res = smk.fit_resample(X, Y)
X_res.shape, y_res.shape
((567562, 30), (567562,))
# Reassemble a single DataFrame of resampled features plus the target column.
# .copy() prevents the "Class" assignment from silently mutating X_res
# (the original assignment was an alias, not a copy).
balanced_data = X_res.copy()
balanced_data["Class"] = y_res
balanced_data.head()
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
| 1 | 0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
| 2 | 1 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
| 3 | 1 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
| 4 | 2 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
from collections import Counter
# Compare the class frequencies before and after resampling.
print(f'Original dataset shape {Counter(Y)}')
print(f'Resampled dataset shape {Counter(y_res)}')
Original dataset shape Counter({0: 284315, 1: 492})
Resampled dataset shape Counter({0: 283781, 1: 283781})
# Bar chart of the (now balanced) class distribution.
# The idiomatic call is Series.value_counts(); the original passed a Series
# to DataFrame.value_counts(), whose first argument expects column labels.
count_classes = balanced_data['Class'].value_counts(sort=True)
count_classes.plot(kind='bar', rot=0)
plt.title("Transaction Class Distribution")
plt.xlabel("Class")
plt.ylabel("Frequency")
Text(0, 0.5, 'Frequency')
# Convert the balanced Pandas DataFrame back into an ADS dataset so the ADS
# utilities (train/test split, AutoML, evaluation) can be reused downstream.
balanced_ds = DatasetFactory.open(balanced_data,
target = "Class",
type_discovery = False,
types = {'Class': 'category'})
Initializing:0.000s Opening data:0.167s Generating data sample:0.129s Building dataset:0.448s
# view summary of balanced data, notice that exactly 50% of the "Class" attribute has zeros, the remaining 50% has ones
balanced_ds.show_in_notebook()
# Column/dtype summary of the balanced dataset.
balanced_ds.info()
<class 'dask.dataframe.core.DataFrame'> Columns: 31 entries, Time to Class dtypes: float64(29), int32(1), int8(1)
The balanced dataset is then split into train and test sets. The train set is used by the AutoML feature to train several different ML models.
# The balanced dataset is split 80% train / 20% test.
train, test = balanced_ds.train_test_split(test_size=0.2)
# AutoML is used to automate algorithm selection:
# several candidate classifiers are trained and tuned by Oracle AutoML.
ml_engine = OracleAutoMLProvider(n_jobs=-1, loglevel=logging.ERROR)  # n_jobs=-1: use all available cores
automl = AutoML(train, provider=ml_engine)
# Restrict the search to four model families; time_budget caps the search
# (presumably in seconds — confirm against ADS AutoML docs).
# Returns the tuned best model and a baseline model for comparison.
model, baseline = automl.train(model_list=[
'LogisticRegression',
'LGBMClassifier',
'XGBClassifier',
'RandomForestClassifier'], time_budget=10)
Below we show the performance of the different algorithms that were trained and the results of each stage of the AutoML pipeline
# Print the trials that were run during the AutoML training period,
# sorted by mean validation score (the scores reported are negative —
# values closer to zero rank higher in the table below).
automl.print_trials(max_rows=20, sort_column='Mean Validation Score')
| Algorithm | #Samples | #Features | Mean Validation Score | Hyperparameters | CPU Time |
|---|---|---|---|---|---|
| LGBMClassifier_HT | 453930 | 30 | -0.0041 | {'boosting_type': 'gbdt', 'class_weight': 'balanced', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_weight': 0.001, 'n_estimators': 100, 'num_leaves': 31, 'reg_alpha': 0, 'reg_lambda': 1} | 152.0992 |
| LGBMClassifier_AS | 22696 | 30 | -0.0069 | {'boosting_type': 'gbdt', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_weight': 0.001, 'n_estimators': 100, 'num_leaves': 31, 'reg_alpha': 0, 'reg_lambda': 1, 'class_weight': 'balanced'} | 12.6054 |
| XGBClassifier_AS | 22696 | 30 | -0.0384 | {'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1} | 46.9292 |
| RandomForestClassifier_AS | 22696 | 30 | -0.0417 | {'n_estimators': 100, 'class_weight': 'balanced', 'max_features': 0.777777778, 'min_samples_leaf': 0.000625, 'min_samples_split': 0.00125} | 359.4325 |
| LogisticRegression_AS | 22696 | 30 | -0.0621 | {'C': 1.0, 'class_weight': 'balanced', 'solver': 'liblinear'} | 6.1619 |
# View per-algorithm performance from the algorithm-selection stage
# (validation scores are negative; the closer to zero, the better the trial).
automl.visualize_algorithm_selection_trials()
# View hyperparameter tuning trials by AutoML
automl.visualize_tuning_trials()
# Hold-out accuracy: the fraction of test labels the model predicts correctly.
correct = test.y == model.predict(test.X)
accuracy = sum(correct) / len(test.y)
print("The accuracy of the model is:", accuracy)
The accuracy of the model is: 0.9994631793860884
# Materialize the test split as NumPy arrays for the row-wise comparison below.
test_arr_x = np.asarray(test.X)
test_arr_y = np.asarray(test.y)
len(test_arr_y)
113632
# Materialize the train split as NumPy arrays (for inspection of split sizes).
train_arr_x = np.asarray(train.X)
train_arr_y = np.asarray(train.y)
len(train_arr_y)
453930
predict_test = np.asarray(model.predict(test.X))
# Recall on the fraud class: of all true fraud rows (label 1), count how many
# the model predicted correctly.  zip() replaces the range(len(...)) index
# loop; the logic is otherwise unchanged.
num_fraud = 0
correct_prediction = 0
for actual, predicted in zip(test_arr_y, predict_test):
    if actual == 1:
        num_fraud += 1
        if actual == predicted:
            correct_prediction += 1
print(correct_prediction)
print(num_fraud)
# Final expression = fraud-class recall (the notebook cell's displayed value).
correct_prediction/num_fraud
56749 56751
0.9999647583302497
ADS provides a thorough model evaluation and explanation API through the ADSEvaluator and ADSExplainer objects. You can inspect both global and local evaluations and explanations with ADS. Some demonstrations are shown below.
# ADSEvaluator computes evaluation metrics/plots for the model on the test set
# and renders them inline in the notebook.
evaluator = ADSEvaluator(test, models=[model])
evaluator.show_in_notebook()
# Our model explainer class, bound to the model and the test data.
explainer = ADSExplainer(test, model)
# Create a global (dataset-level) explainer backed by the MLX provider.
global_explainer = explainer.global_explanation(provider=MLXGlobalExplainer())
# Compute feature importance values and display the top 20 in the notebook.
importances = global_explainer.compute_feature_importance()
importances.show_in_notebook(n_features=20)
The Oracle Model Catalog is an Oracle Data Science service that allows you to save your model in a ready-to-deploy state in the OCI console. To save your model in the catalog, you first need to package it as a model artifact. A model artifact is your runnable model, including any of its dependencies, stored within a directory.
# Prepare the model artifact: serializes the model (ONNX, per the directory
# listing below) plus scoring/runtime files into the given directory so it
# can be saved to the model catalog.
model_artifact = model.prepare("/home/datascience/fraud_model_balanced/",
force_overwrite=True,
data_sample=test,
include_data_sample=True,
data_science_env=True)
Initializing:0.000s Preparing Model Artifact Directory:0.007s Serializing model:5.656s Generating schema:13.492s Updating requirements.txt:0.024s Creating runtime.yaml configuration:0.135s Writing func.yaml:0.005s Writing func.py:0.015s
# This shows all the files that exist within the model artifact directory.
!ls /home/datascience/fraud_model_balanced/
data-sample.json model.onnx requirements.txt score.py func.py onnx_data_transformer.json runtime.yaml func.yaml __pycache__ schema.json
# Before the model can be saved we need our compartment id and project id,
# both exposed as environment variables inside the notebook session.
compartment_id = os.environ["NB_SESSION_COMPARTMENT_OCID"]
project_id = os.environ["PROJECT_OCID"]
# Save the model artifact (with provenance metadata) to the model catalog.
# ignore_pending_changes=True proceeds despite unsaved changes —
# presumably skipping a git cleanliness check; confirm against ADS docs.
model_artifact.save(project_id=project_id,
compartment_id=compartment_id,
display_name="fraud_model_balanced",
description="Fraud detection model built with AutoML with balanced dataset",
training_script_path="fraud_model_deployment.ipynb",
ignore_pending_changes=True)
Initializing:0.001s Creating model in catalog:0.344s Generating model artifact zip:0.329s Uploading model artifact:18.345s Save provenance metadata:0.317s
| id | ocid1.datasciencemodel.oc1.iad.amaaaaaatwfhi7ya3x5x4icfmr75uxqhsillo4j67ejoon5dwu37xzav3jmq |
|---|---|
| compartment_id | ocid1.compartment.oc1..aaaaaaaa4yvgb3xrr7idbzkzaisrdkwiimwmpemya7iaehqugcdv4egvpxaa |
| project_id | ocid1.datascienceproject.oc1.iad.amaaaaaatwfhi7yakzcke2uuehttf322fpserpjyphzwft27la5bktxkfzcq |
| display_name | fraud_model_balanced |
| description | Fraud detection model built with AutoML with balanced dataset |
| lifecycle_state | ACTIVE |
| time_created | 2021-08-23 00:19:52.619000+00:00 |
| created_by | ocid1.datasciencenotebooksession.oc1.iad.amaaaaaatwfhi7yae7nxvcmggfllcb25gb2xdb3l3dnm7lqudwx2ssxgawla |
| freeform_tags | {} |
| defined_tags | {} |
| user_name | |
| repository_url | None |
| git_branch | None |
| git_commit | None |
| script_dir | /home/datascience/fraud_model_balanced |
| training_script | CAN_BE_INVALID:/home/datascience/fraud_model_deployment.ipynb |