%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings('ignore')
import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)
import ads
import os
from ads.dataset.factory import DatasetFactory
from ads.dataset.dataset_browser import DatasetBrowser
import ads.environment.ml_runtime
from ads.common.model import ADSModel
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.catalog.model import ModelSummaryList, ModelCatalog
from ads.catalog.project import ProjectSummaryList, ProjectCatalog
from ads.catalog.summary import SummaryList
from ads.common.model_artifact import ModelArtifact
ads.set_debug_mode(True)
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
import ads.environment.ml_runtime
import numpy as np
import pandas as pd
import sys
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.font_manager
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from pyod.models.iforest import IForest
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize
from pyod.models.knn import KNN
In order to get the data from object storage we need to follow the steps below:
Go to OCI Console.
a. Click the profile icon at the top right, then click your user name (shown as "oracleidentitycloudservice/<your user name>").
b. Under Resources, go to API Keys.
c. Create a new API key if you don't already have one:
   - Select the "Generate API Key Pair" radio button.
   - Copy the contents of the Configuration File Preview.
   - Download the private key.
   - Click Add.
Come back to the Data Science platform, open terminal.
a. Inside /home/datascience directory create a .oci directory (mkdir .oci)
b. In .oci, create a config file (touch config) and a private_key.pem (touch private_key.pem)
(For the following steps use a text editor for Unix, we used vi - visual editor)
Copy the content of the Configuration File Preview and paste it into this config file inside .oci. Replace "< path to your private keyfile > #TODO" with /home/datascience/.oci/private_key.pem
Use the ADS DatasetFactory in your notebook to load your data. The code is provided below.
# Information such as file_name, name_space and bucket_name will be available in Bucket Details. (Object Storage > Bucket Details)
ads.set_auth(auth='resource_principal')
bucket_name = 'Fraud_data'
file_name = 'fraud_creditcard.csv'
name_space = 'idhkis4m3p5e'
# creditcard.csv is loaded from object storage.
# NOTE(review): the object-storage URI scheme is "oci://" and must include the
# namespace: "oci://<bucket>@<namespace>/<object>".  The original "ocis://"
# URI had no namespace.  The explicit storage_options below (API-key config
# file) take precedence for this call over the resource-principal auth set
# above; a dead storage_options dict built from env vars was removed because
# it was never passed anywhere.
ds = DatasetFactory.open(f"oci://{bucket_name}@{name_space}/{file_name}",
                         storage_options={"config": "~/.oci/config",
                                          "profile": "DEFAULT"},
                         target="Class",
                         type_discovery=False,
                         types={'Class': 'category'})
Initializing:0.000s Opening data:16.718s Generating data sample:6.195s Building dataset:0.486s
# Checking for null values using a seaborn heatmap on the ADS dataset:
# missing entries would appear as contrasting cells; a uniform plot means no nulls.
sns.heatmap(ds.isnull(), cbar=False)
<AxesSubplot:>
# Quick overview of all the column types and how each column's values are distributed.
ds.show_in_notebook()
# Visualize the distribution of the Time feature.
plt.figure(figsize=(10,8))
plt.title('Distribution of Time Feature')
# sns.distplot is deprecated (removed in recent seaborn releases);
# histplot(..., kde=True, stat='density') is the documented replacement
# and produces the same density histogram + KDE overlay.
sns.histplot(ds.Time, kde=True, stat='density')
<AxesSubplot:title={'center':'Distribution of Time Feature'}, xlabel='Time', ylabel='Density'>
# To explore features, use the smart plot() method. It accepts one or two feature names.
# show_in_notebook() automatically determines the best type of plot for those features.
ds.plot("Class").show_in_notebook(figsize=(4,4))
# Class counts; .compute() forces evaluation of the lazily-evaluated (dask-backed) column.
ds['Class'].value_counts().compute()
0 284315 1 492 Name: Class, dtype: int64
ds.info()
<class 'dask.dataframe.core.DataFrame'> Columns: 31 entries, Time to Class dtypes: float64(31)
# There are functions to convert an ADS dataset to a Pandas DataFrame and vice versa.
data = ds.to_pandas_dataframe()
from imblearn.combine import SMOTETomek
# Create independent (X) and dependent (Y) features.
# Store the name of the variable we are predicting.
target = "Class"
# All columns except the target are predictors.
columns = [c for c in data.columns if c != target]
# NOTE(review): the original also built an unused np.random.RandomState(42);
# it was never referenced (SMOTETomek below receives random_state=42 directly),
# so it has been removed.
X = data[columns]
Y = data[target]
# Print the shapes of X & Y
print(X.shape)
print(Y.shape)
(284807, 30) (284807,)
# Implementing oversampling for handling the class imbalance using SMOTE-Tomek links.
smk = SMOTETomek(random_state=42)
# fit_sample was deprecated and later removed from imbalanced-learn;
# fit_resample is the supported API with identical behavior.
X_res, y_res = smk.fit_resample(X, Y)
X_res.shape, y_res.shape
((567562, 30), (567562,))
# Reassemble a single DataFrame of resampled features plus the target column.
# .copy() prevents the "Class" assignment from silently mutating X_res
# (the original assignment was an alias, not a copy).
balanced_data = X_res.copy()
balanced_data["Class"] = y_res
balanced_data.head()
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
| 1 | 0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
| 2 | 1 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
| 3 | 1 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
| 4 | 2 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
from collections import Counter
# Compare the class frequencies before and after resampling.
print(f'Original dataset shape {Counter(Y)}')
print(f'Resampled dataset shape {Counter(y_res)}')
Original dataset shape Counter({0: 284315, 1: 492})
Resampled dataset shape Counter({0: 283781, 1: 283781})
# Bar chart of the (now balanced) class distribution.
# The idiomatic call is Series.value_counts(); the original passed a Series
# to DataFrame.value_counts(), whose first argument expects column labels.
count_classes = balanced_data['Class'].value_counts(sort=True)
count_classes.plot(kind='bar', rot=0)
plt.title("Transaction Class Distribution")
plt.xlabel("Class")
plt.ylabel("Frequency")
Text(0, 0.5, 'Frequency')
# Convert the balanced Pandas DataFrame back into an ADS dataset so the ADS
# utilities (train/test split, AutoML, evaluation) can be reused downstream.
balanced_ds = DatasetFactory.open(balanced_data,
target = "Class",
type_discovery = False,
types = {'Class': 'category'})
Initializing:0.000s Opening data:0.167s Generating data sample:0.129s Building dataset:0.448s
# view summary of balanced data, notice that exactly 50% of the "Class" attribute has zeros, the remaining 50% has ones
balanced_ds.show_in_notebook()
# Column/dtype summary of the balanced dataset.
balanced_ds.info()
<class 'dask.dataframe.core.DataFrame'> Columns: 31 entries, Time to Class dtypes: float64(29), int32(1), int8(1)
The balanced dataset is then split into train and test sets. The train set is used by the AutoML feature to train several different ML models.
# The balanced dataset is split 80% train / 20% test.
train, test = balanced_ds.train_test_split(test_size=0.2)
# AutoML is used to automate algorithm selection:
# several candidate classifiers are trained and tuned by Oracle AutoML.
ml_engine = OracleAutoMLProvider(n_jobs=-1, loglevel=logging.ERROR)  # n_jobs=-1: use all available cores
automl = AutoML(train, provider=ml_engine)
# Restrict the search to four model families; time_budget caps the search
# (presumably in seconds — confirm against ADS AutoML docs).
# Returns the tuned best model and a baseline model for comparison.
model, baseline = automl.train(model_list=[
'LogisticRegression',
'LGBMClassifier',
'XGBClassifier',
'RandomForestClassifier'], time_budget=10)
Below we show the performance of the different algorithms that were trained and the results of each stage of the AutoML pipeline
# Print the trials that were run during the AutoML training period,
# sorted by mean validation score (the scores reported are negative —
# values closer to zero rank higher in the table below).
automl.print_trials(max_rows=20, sort_column='Mean Validation Score')
| Algorithm | #Samples | #Features | Mean Validation Score | Hyperparameters | CPU Time |
|---|---|---|---|---|---|
| LGBMClassifier_HT | 453930 | 30 | -0.0041 | {'boosting_type': 'gbdt', 'class_weight': 'balanced', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_weight': 0.001, 'n_estimators': 100, 'num_leaves': 31, 'reg_alpha': 0, 'reg_lambda': 1} | 152.0992 |
| LGBMClassifier_AS | 22696 | 30 | -0.0069 | {'boosting_type': 'gbdt', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_weight': 0.001, 'n_estimators': 100, 'num_leaves': 31, 'reg_alpha': 0, 'reg_lambda': 1, 'class_weight': 'balanced'} | 12.6054 |
| XGBClassifier_AS | 22696 | 30 | -0.0384 | {'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1} | 46.9292 |
| RandomForestClassifier_AS | 22696 | 30 | -0.0417 | {'n_estimators': 100, 'class_weight': 'balanced', 'max_features': 0.777777778, 'min_samples_leaf': 0.000625, 'min_samples_split': 0.00125} | 359.4325 |
| LogisticRegression_AS | 22696 | 30 | -0.0621 | {'C': 1.0, 'class_weight': 'balanced', 'solver': 'liblinear'} | 6.1619 |
# View per-algorithm performance from the algorithm-selection stage
# (validation scores are negative; the closer to zero, the better the trial).
automl.visualize_algorithm_selection_trials()
# View hyperparameter tuning trials by AutoML
automl.visualize_tuning_trials()
# Hold-out accuracy: the fraction of test labels the model predicts correctly.
correct = test.y == model.predict(test.X)
accuracy = sum(correct) / len(test.y)
print("The accuracy of the model is:", accuracy)
The accuracy of the model is: 0.9994631793860884
# Materialize the test split as NumPy arrays for the row-wise comparison below.
test_arr_x = np.asarray(test.X)
test_arr_y = np.asarray(test.y)
len(test_arr_y)
113632
# Materialize the train split as NumPy arrays (for inspection of split sizes).
train_arr_x = np.asarray(train.X)
train_arr_y = np.asarray(train.y)
len(train_arr_y)
453930
predict_test = np.asarray(model.predict(test.X))
# Recall on the fraud class: of all true fraud rows (label 1), count how many
# the model predicted correctly.  zip() replaces the range(len(...)) index
# loop; the logic is otherwise unchanged.
num_fraud = 0
correct_prediction = 0
for actual, predicted in zip(test_arr_y, predict_test):
    if actual == 1:
        num_fraud += 1
        if actual == predicted:
            correct_prediction += 1
print(correct_prediction)
print(num_fraud)
# Final expression = fraud-class recall (the notebook cell's displayed value).
correct_prediction/num_fraud
56749 56751
0.9999647583302497
ADS provides a thorough model evaluation and explanation API through the ADSEvaluator and ADSExplainer objects. You can inspect both global and local evaluations and explanations with ADS. Some demonstrations are shown below.
# ADSEvaluator computes evaluation metrics/plots for the model on the test set
# and renders them inline in the notebook.
evaluator = ADSEvaluator(test, models=[model])
evaluator.show_in_notebook()
# Our model explainer class, bound to the model and the test data.
explainer = ADSExplainer(test, model)
# Create a global (dataset-level) explainer backed by the MLX provider.
global_explainer = explainer.global_explanation(provider=MLXGlobalExplainer())
# Compute feature importance values and display the top 20 in the notebook.
importances = global_explainer.compute_feature_importance()
importances.show_in_notebook(n_features=20)
The Oracle Model Catalog is an Oracle Data Science service that allows you to save your model in a ready-to-deploy state in the OCI console. To save your model in the catalog, you first need to package it as a model artifact. A model artifact is your runnable model, including any of its dependencies, stored within a directory.
# Prepare the model artifact: serializes the model (ONNX, per the directory
# listing below) plus scoring/runtime files into the given directory so it
# can be saved to the model catalog.
model_artifact = model.prepare("/home/datascience/fraud_model_balanced/",
force_overwrite=True,
data_sample=test,
include_data_sample=True,
data_science_env=True)
Initializing:0.000s Preparing Model Artifact Directory:0.007s Serializing model:5.656s Generating schema:13.492s Updating requirements.txt:0.024s Creating runtime.yaml configuration:0.135s Writing func.yaml:0.005s Writing func.py:0.015s
# This shows all the files that exist within the model artifact directory.
!ls /home/datascience/fraud_model_balanced/
data-sample.json model.onnx requirements.txt score.py func.py onnx_data_transformer.json runtime.yaml func.yaml __pycache__ schema.json
# Before the model can be saved we need our compartment id and project id,
# both exposed as environment variables inside the notebook session.
compartment_id = os.environ["NB_SESSION_COMPARTMENT_OCID"]
project_id = os.environ["PROJECT_OCID"]
# Save the model artifact (with provenance metadata) to the model catalog.
# ignore_pending_changes=True proceeds despite unsaved changes —
# presumably skipping a git cleanliness check; confirm against ADS docs.
model_artifact.save(project_id=project_id,
compartment_id=compartment_id,
display_name="fraud_model_balanced",
description="Fraud detection model built with AutoML with balanced dataset",
training_script_path="fraud_model_deployment.ipynb",
ignore_pending_changes=True)
Initializing:0.001s Creating model in catalog:0.344s Generating model artifact zip:0.329s Uploading model artifact:18.345s Save provenance metadata:0.317s
| id | ocid1.datasciencemodel.oc1.iad.amaaaaaatwfhi7ya3x5x4icfmr75uxqhsillo4j67ejoon5dwu37xzav3jmq |
|---|---|
| compartment_id | ocid1.compartment.oc1..aaaaaaaa4yvgb3xrr7idbzkzaisrdkwiimwmpemya7iaehqugcdv4egvpxaa |
| project_id | ocid1.datascienceproject.oc1.iad.amaaaaaatwfhi7yakzcke2uuehttf322fpserpjyphzwft27la5bktxkfzcq |
| display_name | fraud_model_balanced |
| description | Fraud detection model built with AutoML with balanced dataset |
| lifecycle_state | ACTIVE |
| time_created | 2021-08-23 00:19:52.619000+00:00 |
| created_by | ocid1.datasciencenotebooksession.oc1.iad.amaaaaaatwfhi7yae7nxvcmggfllcb25gb2xdb3l3dnm7lqudwx2ssxgawla |
| freeform_tags | {} |
| defined_tags | {} |
| user_name | |
| repository_url | None |
| git_branch | None |
| git_commit | None |
| script_dir | /home/datascience/fraud_model_balanced |
| training_script | CAN_BE_INVALID:/home/datascience/fraud_model_deployment.ipynb |