import numpy as np #linear algebra
import pandas as pd #dataframe manipulation
import csv #CSV (comma-separated values) file reading and writing
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split #split train data
from sklearn.preprocessing import StandardScaler #Standardize features by removing the mean and scaling to unit variance.
from sklearn.neighbors import KNeighborsClassifier # Classifier implementing the k-nearest neighbors vote.
from sklearn.metrics import confusion_matrix #testing for false positive and negative
from sklearn.metrics import f1_score #Compute the F1 score, also known as balanced F-score or F-measure.
from sklearn.metrics import accuracy_score #The accuracy_score function computes the accuracy, either the fraction (default) or the count (normalize=False) of correct predictions.


# Load the credit-card transactions from the current working directory.
df = pd.read_csv('creditcard.csv')

# Show the frame as the cell output.
df


# Number of missing values in each column.
df.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


# Count the fraudulent transactions (label 1 in the last column) and their
# share of the whole dataset. A vectorized comparison replaces the
# element-by-element Python loop over every row.
count = int((df.iloc[:, -1] == 1).sum())
print(len(df))
print(count)
# Fraction of all rows that are labelled as fraud.
ratio = count / len(df)
print(ratio)

284807
492
0.001727485630620034


# Pie chart of the class balance, with percentages shown to two decimals.
y = df['Class']
class_counts = y.value_counts()
class_counts.plot.pie(autopct='%.2f')

<AxesSubplot:ylabel='Class'>


# Features are the first 30 columns; the label is the column at position 30.
X = df.iloc[:, :30]
y = df.iloc[:, 30]
# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


# Standardize the features. The scaler's statistics are learned on the
# training split only and then applied unchanged to the test split.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)


# k-NN classifier voting over the 11 nearest neighbours, with Euclidean
# distance between points (p = 2 is the matching Minkowski power).
classifier = KNeighborsClassifier(
    n_neighbors=11,
    metric='euclidean',
    p=2,
).fit(X_train, y_train)
# Predicted labels for the held-out rows (shown as the cell output).
classifier.predict(X_test)

/opt/miniconda3/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:237: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)

array([0, 0, 0, ..., 0, 0, 0])


from sklearn.metrics import roc_curve,roc_auc_score
from sklearn.metrics import auc
import matplotlib.pyplot as plt


# Per-class probabilities for every test row (one column per class).
y_scores = classifier.predict_proba(X_test)


# Print only the rows whose second-column probability is neither exactly 0
# nor exactly 1, i.e. rows where the neighbours did not vote unanimously.
for row_number, probs in enumerate(y_scores):
    if probs[1] == 0 or probs[1] == 1:
        continue
    print(row_number, probs)

67 [0.90909091 0.09090909]
159 [0.18181818 0.81818182]
365 [0.09090909 0.90909091]
1088 [0.90909091 0.09090909]
1241 [0.90909091 0.09090909]
1795 [0.90909091 0.09090909]
1976 [0.90909091 0.09090909]
3371 [0.90909091 0.09090909]
3682 [0.90909091 0.09090909]
4608 [0.81818182 0.18181818]
5286 [0.63636364 0.36363636]
6122 [0.36363636 0.63636364]
6893 [0.18181818 0.81818182]
6940 [0.18181818 0.81818182]
7418 [0.90909091 0.09090909]
7496 [0.45454545 0.54545455]
7924 [0.90909091 0.09090909]
8072 [0.18181818 0.81818182]
8139 [0.36363636 0.63636364]
8227 [0.09090909 0.90909091]
8297 [0.90909091 0.09090909]
8374 [0.09090909 0.90909091]
8754 [0.90909091 0.09090909]
8834 [0.90909091 0.09090909]
8914 [0.90909091 0.09090909]
9062 [0.54545455 0.45454545]
9126 [0.90909091 0.09090909]
9140 [0.18181818 0.81818182]
9270 [0.90909091 0.09090909]
9335 [0.90909091 0.09090909]
9365 [0.90909091 0.09090909]
9459 [0.90909091 0.09090909]
9744 [0.09090909 0.90909091]
9757 [0.18181818 0.81818182]
9825 [0.90909091 0.09090909]
9895 [0.81818182 0.18181818]
9941 [0.18181818 0.81818182]
9999 [0.90909091 0.09090909]
10291 [0.09090909 0.90909091]
10860 [0.36363636 0.63636364]
11003 [0.90909091 0.09090909]
11442 [0.90909091 0.09090909]
11522 [0.54545455 0.45454545]
11974 [0.90909091 0.09090909]
12618 [0.54545455 0.45454545]
13034 [0.81818182 0.18181818]
13402 [0.90909091 0.09090909]
13797 [0.27272727 0.72727273]
14842 [0.90909091 0.09090909]
14861 [0.90909091 0.09090909]
14955 [0.63636364 0.36363636]
15373 [0.63636364 0.36363636]
15378 [0.90909091 0.09090909]
16155 [0.90909091 0.09090909]
16224 [0.90909091 0.09090909]
16410 [0.81818182 0.18181818]
16460 [0.72727273 0.27272727]
17344 [0.90909091 0.09090909]
17834 [0.90909091 0.09090909]
18041 [0.90909091 0.09090909]
18490 [0.90909091 0.09090909]
19991 [0.54545455 0.45454545]
20240 [0.90909091 0.09090909]
20438 [0.90909091 0.09090909]
20536 [0.90909091 0.09090909]
20838 [0.90909091 0.09090909]
21067 [0.90909091 0.09090909]
21461 [0.90909091 0.09090909]
21676 [0.90909091 0.09090909]
22124 [0.63636364 0.36363636]
22226 [0.90909091 0.09090909]
22496 [0.72727273 0.27272727]
22980 [0.90909091 0.09090909]
23102 [0.09090909 0.90909091]
23112 [0.18181818 0.81818182]
23208 [0.90909091 0.09090909]
23346 [0.90909091 0.09090909]
24520 [0.09090909 0.90909091]
24725 [0.90909091 0.09090909]
25296 [0.90909091 0.09090909]
25369 [0.90909091 0.09090909]
25629 [0.72727273 0.27272727]
26516 [0.90909091 0.09090909]
26968 [0.90909091 0.09090909]
27509 [0.18181818 0.81818182]
27595 [0.18181818 0.81818182]
28031 [0.90909091 0.09090909]
28239 [0.90909091 0.09090909]
29946 [0.90909091 0.09090909]
30244 [0.09090909 0.90909091]
30250 [0.90909091 0.09090909]
30986 [0.27272727 0.72727273]
31059 [0.63636364 0.36363636]
31551 [0.90909091 0.09090909]
31836 [0.90909091 0.09090909]
32179 [0.09090909 0.90909091]
32385 [0.90909091 0.09090909]
33034 [0.90909091 0.09090909]
33083 [0.90909091 0.09090909]
33243 [0.90909091 0.09090909]
34378 [0.90909091 0.09090909]
35320 [0.54545455 0.45454545]
35435 [0.72727273 0.27272727]
36316 [0.81818182 0.18181818]
37127 [0.45454545 0.54545455]
37501 [0.09090909 0.90909091]
37652 [0.63636364 0.36363636]
39039 [0.90909091 0.09090909]
39539 [0.90909091 0.09090909]
39697 [0.90909091 0.09090909]
39947 [0.90909091 0.09090909]
40034 [0.90909091 0.09090909]
40399 [0.90909091 0.09090909]
40410 [0.90909091 0.09090909]
41067 [0.54545455 0.45454545]
41201 [0.09090909 0.90909091]
41321 [0.81818182 0.18181818]
41434 [0.63636364 0.36363636]
41488 [0.90909091 0.09090909]
41546 [0.45454545 0.54545455]
41714 [0.90909091 0.09090909]
41751 [0.90909091 0.09090909]
42029 [0.90909091 0.09090909]
42156 [0.45454545 0.54545455]
42431 [0.90909091 0.09090909]
42510 [0.63636364 0.36363636]
42564 [0.18181818 0.81818182]
42672 [0.18181818 0.81818182]
43191 [0.90909091 0.09090909]
43583 [0.90909091 0.09090909]
43681 [0.27272727 0.72727273]
43682 [0.90909091 0.09090909]
44402 [0.90909091 0.09090909]
44441 [0.90909091 0.09090909]
44464 [0.45454545 0.54545455]
44555 [0.90909091 0.09090909]
44834 [0.90909091 0.09090909]
45249 [0.09090909 0.90909091]
45430 [0.90909091 0.09090909]
45485 [0.63636364 0.36363636]
45792 [0.45454545 0.54545455]
46317 [0.90909091 0.09090909]
46434 [0.09090909 0.90909091]
46472 [0.90909091 0.09090909]
46903 [0.09090909 0.90909091]
47294 [0.90909091 0.09090909]
47371 [0.90909091 0.09090909]
47437 [0.90909091 0.09090909]
47918 [0.90909091 0.09090909]
48466 [0.90909091 0.09090909]
48742 [0.81818182 0.18181818]
48763 [0.90909091 0.09090909]
48969 [0.90909091 0.09090909]
49033 [0.90909091 0.09090909]
49377 [0.90909091 0.09090909]
49609 [0.90909091 0.09090909]
49630 [0.63636364 0.36363636]
49663 [0.09090909 0.90909091]
49692 [0.90909091 0.09090909]
49735 [0.90909091 0.09090909]
49889 [0.90909091 0.09090909]
50043 [0.90909091 0.09090909]
50983 [0.90909091 0.09090909]
51817 [0.54545455 0.45454545]
51940 [0.90909091 0.09090909]
52172 [0.09090909 0.90909091]
52244 [0.63636364 0.36363636]
52396 [0.90909091 0.09090909]
52623 [0.27272727 0.72727273]
52687 [0.09090909 0.90909091]
52927 [0.90909091 0.09090909]
52983 [0.81818182 0.18181818]
53012 [0.63636364 0.36363636]
53584 [0.90909091 0.09090909]
54204 [0.90909091 0.09090909]
55249 [0.90909091 0.09090909]
56539 [0.18181818 0.81818182]
56552 [0.90909091 0.09090909]


# ROC curve computed from the second-column scores, and the area under it.
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:, 1])
roc_auc = auc(fpr, tpr)


# Draw the ROC curve together with the diagonal of a random classifier.
auc_label = 'AUC = %0.2f' % roc_auc
plt.plot(fpr, tpr, 'b', label=auc_label)
plt.plot([0, 1], [0, 1], 'b--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve of kNN classifier ')
plt.legend(loc='lower right')
plt.show()


# Notebook shell command (not Python): installs the imbalanced-learn package,
# which provides the `imblearn` module imported below.
pip install imbalanced-learn;

Requirement already satisfied: imbalanced-learn in /opt/miniconda3/lib/python3.9/site-packages (0.11.0)
Requirement already satisfied: scipy>=1.5.0 in /opt/miniconda3/lib/python3.9/site-packages (from imbalanced-learn) (1.10.1)
Requirement already satisfied: scikit-learn>=1.0.2 in /opt/miniconda3/lib/python3.9/site-packages (from imbalanced-learn) (1.1.2)
Requirement already satisfied: numpy>=1.17.3 in /opt/miniconda3/lib/python3.9/site-packages (from imbalanced-learn) (1.23.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/miniconda3/lib/python3.9/site-packages (from imbalanced-learn) (3.1.0)
Requirement already satisfied: joblib>=1.1.1 in /opt/miniconda3/lib/python3.9/site-packages (from imbalanced-learn) (1.3.2)
Note: you may need to restart the kernel to use updated packages.


import imblearn
from imblearn.over_sampling import SMOTE
from random import randrange, uniform
from sklearn.neighbors import NearestNeighbors


# Peek at the first five rows.
df.head()


# Number of rows per class label.
df['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64


# Remove the 'Time' column before building the resampling inputs.
df = df.drop(columns=['Time'])


# Feature matrix: every remaining column except the label.
X = df.drop(columns=['Class'])
X


# Target vector.
y = df['Class']
y

0         0
1         0
2         0
3         0
4         0
         ..
284802    0
284803    0
284804    0
284805    0
284806    0
Name: Class, Length: 284807, dtype: int64


# Oversample the minority class with SMOTE, interpolating between each
# minority sample and its 2 nearest minority neighbours.
# NOTE(review): `ratio` was computed as fraud rows / all rows, whereas a float
# sampling_strategy is presumably interpreted as minority / majority after
# resampling -- confirm that 1.5 * ratio is the intended target.
smote_target = ratio * 1.5
sm = SMOTE(sampling_strategy=smote_target, k_neighbors=2, random_state=100)


X_res, y_res = sm.fit_resample(X, y)


# Class counts after resampling.
y_res.value_counts()

0    284315
1       736
Name: Class, dtype: int64


# Split the REAL data first, then oversample the training split only.
# Resampling the whole dataset before splitting leaks information: synthetic
# points interpolated from rows that end up in the test split would be used
# for training, and the test split itself would contain synthetic rows, so
# the reported ROC/AUC would be optimistic.
X_train_SMOTE, X_test_SMOTE, y_train_SMOTE, y_test_SMOTE = train_test_split(X, y, random_state = 0, test_size = 0.2)

# Apply the SMOTE sampler configured above to the training rows only; the
# test split keeps the original, untouched class distribution.
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train_SMOTE, y_train_SMOTE)

# Standardize the features: statistics are fitted on the (resampled) training
# split and applied unchanged to the test split.
sc_X = StandardScaler()
X_train_SMOTE = sc_X.fit_transform(X_train_SMOTE)
X_test_SMOTE = sc_X.transform(X_test_SMOTE)

# k-NN voting over the 11 nearest neighbours with Euclidean distance
# (p = 2 is the Minkowski power that corresponds to the Euclidean metric).
classifier = KNeighborsClassifier(n_neighbors = 11, p =2, metric = 'euclidean')

classifier.fit(X_train_SMOTE, y_train_SMOTE)
# Predicted labels for the held-out rows (shown as the cell output).
classifier.predict(X_test_SMOTE)

/opt/miniconda3/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:237: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)

array([0, 0, 0, ..., 0, 0, 0])


# Per-class probabilities for the test split from the classifier fitted on the
# SMOTE training data (one column per class).
y_scores_SMOTE = classifier.predict_proba(X_test_SMOTE)


# Display the probability array as the cell output.
y_scores_SMOTE

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])


# ROC curve computed from the second-column scores of the SMOTE-trained model,
# and the area under it.
fpr, tpr, thresholds = roc_curve(y_test_SMOTE, y_scores_SMOTE[:, 1])
roc_auc_SMOTE = auc(fpr, tpr)

# Draw the ROC curve together with the diagonal of a random classifier.
auc_label_smote = 'AUC = %0.2f' % roc_auc_SMOTE
plt.plot(fpr, tpr, 'b', label=auc_label_smote)
plt.plot([0, 1], [0, 1], 'b--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve of kNN classifier with SMOTE ')
plt.legend(loc='lower right')
plt.show()


# Notebook shell command (not Python): reinstalls Jinja2 pinned to 3.0.3,
# ignoring any already-installed copy (-I).
# NOTE(review): the recorded output reports a conflict with flask 3.0.0, which
# requires Jinja2>=3.1.2 -- confirm this pin is still wanted.
pip install -I jinja2==3.0.3

Collecting jinja2==3.0.3
  Downloading Jinja2-3.0.3-py3-none-any.whl (133 kB)
     |████████████████████████████████| 133 kB 4.1 MB/s eta 0:00:01
Collecting MarkupSafe>=2.0
  Using cached MarkupSafe-2.1.3-cp39-cp39-macosx_10_9_universal2.whl (17 kB)
Installing collected packages: MarkupSafe, jinja2
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorboard 2.10.0 requires protobuf<3.20,>=3.9.2, but you have protobuf 4.22.1 which is incompatible.
flask 3.0.0 requires Jinja2>=3.1.2, but you have jinja2 3.0.3 which is incompatible.
Successfully installed MarkupSafe-2.1.3 jinja2-3.1.2
Note: you may need to restart the kernel to use updated packages.

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	...	V21	V22	V23	V24	V25	V26	V27	V28	Amount	Class
0	0.0	-1.359807	-0.072781	2.536347	1.378155	-0.338321	0.462388	0.239599	0.098698	0.363787	...	-0.018307	0.277838	-0.110474	0.066928	0.128539	-0.189115	0.133558	-0.021053	149.62	0
1	0.0	1.191857	0.266151	0.166480	0.448154	0.060018	-0.082361	-0.078803	0.085102	-0.255425	...	-0.225775	-0.638672	0.101288	-0.339846	0.167170	0.125895	-0.008983	0.014724	2.69	0
2	1.0	-1.358354	-1.340163	1.773209	0.379780	-0.503198	1.800499	0.791461	0.247676	-1.514654	...	0.247998	0.771679	0.909412	-0.689281	-0.327642	-0.139097	-0.055353	-0.059752	378.66	0
3	1.0	-0.966272	-0.185226	1.792993	-0.863291	-0.010309	1.247203	0.237609	0.377436	-1.387024	...	-0.108300	0.005274	-0.190321	-1.175575	0.647376	-0.221929	0.062723	0.061458	123.50	0
4	2.0	-1.158233	0.877737	1.548718	0.403034	-0.407193	0.095921	0.592941	-0.270533	0.817739	...	-0.009431	0.798278	-0.137458	0.141267	-0.206010	0.502292	0.219422	0.215153	69.99	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
284802	172786.0	-11.881118	10.071785	-9.834783	-2.066656	-5.364473	-2.606837	-4.918215	7.305334	1.914428	...	0.213454	0.111864	1.014480	-0.509348	1.436807	0.250034	0.943651	0.823731	0.77	0
284803	172787.0	-0.732789	-0.055080	2.035030	-0.738589	0.868229	1.058415	0.024330	0.294869	0.584800	...	0.214205	0.924384	0.012463	-1.016226	-0.606624	-0.395255	0.068472	-0.053527	24.79	0
284804	172788.0	1.919565	-0.301254	-3.249640	-0.557828	2.630515	3.031260	-0.296827	0.708417	0.432454	...	0.232045	0.578229	-0.037501	0.640134	0.265745	-0.087371	0.004455	-0.026561	67.88	0
284805	172788.0	-0.240440	0.530483	0.702510	0.689799	-0.377961	0.623708	-0.686180	0.679145	0.392087	...	0.265245	0.800049	-0.163298	0.123205	-0.569159	0.546668	0.108821	0.104533	10.00	0
284806	172792.0	-0.533413	-0.189733	0.703337	-0.506271	-0.012546	-0.649617	1.577006	-0.414650	0.486180	...	0.261057	0.643078	0.376777	0.008797	-0.473649	-0.818267	-0.002415	0.013649	217.00	0

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	...	V21	V22	V23	V24	V25	V26	V27	V28	Amount
0	0.0	-1.359807	-0.072781	2.536347	1.378155	-0.338321	0.462388	0.239599	0.098698	0.363787	...	-0.018307	0.277838	-0.110474	0.066928	0.128539	-0.189115	0.133558	-0.021053	149.62
1	0.0	1.191857	0.266151	0.166480	0.448154	0.060018	-0.082361	-0.078803	0.085102	-0.255425	...	-0.225775	-0.638672	0.101288	-0.339846	0.167170	0.125895	-0.008983	0.014724	2.69
2	1.0	-1.358354	-1.340163	1.773209	0.379780	-0.503198	1.800499	0.791461	0.247676	-1.514654	...	0.247998	0.771679	0.909412	-0.689281	-0.327642	-0.139097	-0.055353	-0.059752	378.66
3	1.0	-0.966272	-0.185226	1.792993	-0.863291	-0.010309	1.247203	0.237609	0.377436	-1.387024	...	-0.108300	0.005274	-0.190321	-1.175575	0.647376	-0.221929	0.062723	0.061458	123.50
4	2.0	-1.158233	0.877737	1.548718	0.403034	-0.407193	0.095921	0.592941	-0.270533	0.817739	...	-0.009431	0.798278	-0.137458	0.141267	-0.206010	0.502292	0.219422	0.215153	69.99

	V1	V2	V3	V4	V5	V6	V7	V8	V9	V10	...	V20	V21	V22	V23	V24	V25	V26	V27	V28	Amount
0	-1.359807	-0.072781	2.536347	1.378155	-0.338321	0.462388	0.239599	0.098698	0.363787	0.090794	...	0.251412	-0.018307	0.277838	-0.110474	0.066928	0.128539	-0.189115	0.133558	-0.021053	149.62
1	1.191857	0.266151	0.166480	0.448154	0.060018	-0.082361	-0.078803	0.085102	-0.255425	-0.166974	...	-0.069083	-0.225775	-0.638672	0.101288	-0.339846	0.167170	0.125895	-0.008983	0.014724	2.69
2	-1.358354	-1.340163	1.773209	0.379780	-0.503198	1.800499	0.791461	0.247676	-1.514654	0.207643	...	0.524980	0.247998	0.771679	0.909412	-0.689281	-0.327642	-0.139097	-0.055353	-0.059752	378.66
3	-0.966272	-0.185226	1.792993	-0.863291	-0.010309	1.247203	0.237609	0.377436	-1.387024	-0.054952	...	-0.208038	-0.108300	0.005274	-0.190321	-1.175575	0.647376	-0.221929	0.062723	0.061458	123.50
4	-1.158233	0.877737	1.548718	0.403034	-0.407193	0.095921	0.592941	-0.270533	0.817739	0.753074	...	0.408542	-0.009431	0.798278	-0.137458	0.141267	-0.206010	0.502292	0.219422	0.215153	69.99
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
284802	-11.881118	10.071785	-9.834783	-2.066656	-5.364473	-2.606837	-4.918215	7.305334	1.914428	4.356170	...	1.475829	0.213454	0.111864	1.014480	-0.509348	1.436807	0.250034	0.943651	0.823731	0.77
284803	-0.732789	-0.055080	2.035030	-0.738589	0.868229	1.058415	0.024330	0.294869	0.584800	-0.975926	...	0.059616	0.214205	0.924384	0.012463	-1.016226	-0.606624	-0.395255	0.068472	-0.053527	24.79
284804	1.919565	-0.301254	-3.249640	-0.557828	2.630515	3.031260	-0.296827	0.708417	0.432454	-0.484782	...	0.001396	0.232045	0.578229	-0.037501	0.640134	0.265745	-0.087371	0.004455	-0.026561	67.88
284805	-0.240440	0.530483	0.702510	0.689799	-0.377961	0.623708	-0.686180	0.679145	0.392087	-0.399126	...	0.127434	0.265245	0.800049	-0.163298	0.123205	-0.569159	0.546668	0.108821	0.104533	10.00
284806	-0.533413	-0.189733	0.703337	-0.506271	-0.012546	-0.649617	1.577006	-0.414650	0.486180	-0.915427	...	0.382948	0.261057	0.643078	0.376777	0.008797	-0.473649	-0.818267	-0.002415	0.013649	217.00

Predicting whether credit card usage is fraudulent using kNN

Why kNN?

Now, to train our model, we must prepare the training and testing sets used to fit and evaluate it. We randomly select 80% of the data for the training set and the remaining 20% for the testing set.

Implementing the SMOTE technique