Обнаружение фрода

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
# Load the payment dataset from its hosted CSV file into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/dm-fedorov/ml/master/datasets/payment_fraud.csv',
)
# Preview three randomly chosen rows
df.sample(n=3)
accountAgeDays numItems localTime paymentMethod paymentMethodAgeDays label
31814 4 1 4.965339 creditcard 3.322222 0
27866 5 1 5.017904 creditcard 4.065278 0
33957 506 2 4.745402 creditcard 0.000000 0
# One-hot encode the categorical paymentMethod column: it is replaced by one
# indicator column per distinct payment method value.
# dtype=int pins the indicators to 0/1 integers; without it pandas >= 2.0
# produces boolean (True/False) columns, while older versions produce uint8.
df = pd.get_dummies(df, columns=['paymentMethod'], dtype=int)
# Preview three randomly chosen rows of the encoded frame
df.sample(3)
accountAgeDays numItems localTime paymentMethodAgeDays label paymentMethod_creditcard paymentMethod_paypal paymentMethod_storecredit
25716 975 1 5.040929 0.000000 0 1 0 0
36918 19 1 4.057414 0.001389 0 1 0 0
29477 1001 1 4.505662 0.000000 0 1 0 0
# Hold out 33% of the rows for testing; every column except 'label' is a
# feature and 'label' is the target. The fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['label']),
    df['label'],
    test_size=0.33,
    random_state=17,
)
# Preview three randomly chosen training feature rows
X_train.sample(n=3)
accountAgeDays numItems localTime paymentMethodAgeDays paymentMethod_creditcard paymentMethod_paypal paymentMethod_storecredit
27527 2000 1 5.017904 886.436111 1 0 0
5915 915 1 4.921349 345.954861 0 1 0
27704 470 1 3.066058 322.661111 1 0 0
# Preview three randomly chosen test feature rows
X_test.sample(n=3)
accountAgeDays numItems localTime paymentMethodAgeDays paymentMethod_creditcard paymentMethod_paypal paymentMethod_storecredit
16391 160 1 5.034622 0.000000 1 0 0
28741 1991 1 4.057414 1393.980556 1 0 0
5400 566 1 4.895263 0.000000 1 0 0
# Preview three randomly chosen training labels
y_train.sample(n=3)
13004    0
19215    0
25897    0
Name: label, dtype: int64
# Preview three randomly chosen test labels
y_test.sample(n=3)
5315     0
26059    1
22026    0
Name: label, dtype: int64
# Create a logistic-regression classifier with default settings,
# then fit it on the training features and labels
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Predict a label for every row of the held-out test features
y_pred = clf.predict(X_test)
# Display the predictions
y_pred
array([0, 0, 0, ..., 0, 0, 0])
# Fraction of test-set predictions that match the ground-truth labels.
# Arguments are passed in scikit-learn's documented (y_true, y_pred) order:
# accuracy happens to be symmetric, but keeping the convention avoids wrong
# results if this call is later swapped for an asymmetric metric.
accuracy_score(y_test, y_pred)
1.0
# Print the confusion matrix of true test labels versus predicted labels
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
[[12753     0]
 [    0   190]]