import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Обнаружение фрода (fraud detection)
# Read in the data from the CSV file
df = pd.read_csv('https://raw.githubusercontent.com/dm-fedorov/ml/master/datasets/payment_fraud.csv')

# Peek at three rows. `sample` draws at random (no random_state is given),
# so the rows shown will differ from run to run.
df.sample(3)
# Recorded notebook output (the first, unnamed column is the row index):
# |       | accountAgeDays | numItems | localTime | paymentMethod | paymentMethodAgeDays | label |
# |---|---|---|---|---|---|---|
# | 31814 | 4 | 1 | 4.965339 | creditcard | 3.322222 | 0 |
# | 27866 | 5 | 1 | 5.017904 | creditcard | 4.065278 | 0 |
# | 33957 | 506 | 2 | 4.745402 | creditcard | 0.000000 | 0 |
# Convert categorical feature into dummy variables with one-hot encoding.
# `paymentMethod` is replaced by one indicator column per observed value
# (paymentMethod_creditcard / _paypal / _storecredit in the output below).
df = pd.get_dummies(df, columns=['paymentMethod'])

# Peek at three random rows of the encoded frame.
df.sample(3)
# Recorded notebook output (the first, unnamed column is the row index):
# |       | accountAgeDays | numItems | localTime | paymentMethodAgeDays | label | paymentMethod_creditcard | paymentMethod_paypal | paymentMethod_storecredit |
# |---|---|---|---|---|---|---|---|---|
# | 25716 | 975 | 1 | 5.040929 | 0.000000 | 0 | 1 | 0 | 0 |
# | 36918 | 19 | 1 | 4.057414 | 0.001389 | 0 | 1 | 0 | 0 |
# | 29477 | 1001 | 1 | 4.505662 | 0.000000 | 0 | 1 | 0 | 0 |
# Split dataset up into train and test sets.
# Features are every column except `label`; `label` is the target.
# 33% of the rows are held out for testing, and random_state pins the
# shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('label', axis=1), df['label'],
    test_size=0.33, random_state=17)

# Peek at three random rows of the training features.
X_train.sample(3)
# Recorded notebook output (the first, unnamed column is the row index):
# |       | accountAgeDays | numItems | localTime | paymentMethodAgeDays | paymentMethod_creditcard | paymentMethod_paypal | paymentMethod_storecredit |
# |---|---|---|---|---|---|---|---|
# | 27527 | 2000 | 1 | 5.017904 | 886.436111 | 1 | 0 | 0 |
# | 5915 | 915 | 1 | 4.921349 | 345.954861 | 0 | 1 | 0 |
# | 27704 | 470 | 1 | 3.066058 | 322.661111 | 1 | 0 | 0 |
# Peek at three random rows of the held-out test features.
X_test.sample(3)
# Recorded notebook output (the first, unnamed column is the row index):
# |       | accountAgeDays | numItems | localTime | paymentMethodAgeDays | paymentMethod_creditcard | paymentMethod_paypal | paymentMethod_storecredit |
# |---|---|---|---|---|---|---|---|
# | 16391 | 160 | 1 | 5.034622 | 0.000000 | 1 | 0 | 0 |
# | 28741 | 1991 | 1 | 4.057414 | 1393.980556 | 1 | 0 | 0 |
# | 5400 | 566 | 1 | 4.895263 | 0.000000 | 1 | 0 | 0 |
# Peek at three random training labels (row index, then label value).
y_train.sample(3)
# Recorded notebook output:
# 13004 0
# 19215 0
# 25897 0
# Name: label, dtype: int64
# Peek at three random test labels (row index, then label value).
y_test.sample(3)
# Recorded notebook output:
# 5315 0
# 26059 1
# 22026 0
# Name: label, dtype: int64
# Initialize and train classifier model (default LogisticRegression settings).
clf = LogisticRegression().fit(X_train, y_train)

# Make predictions on test set
y_pred = clf.predict(X_test)

# Echo the predictions; in a notebook this displays the array below.
y_pred
# Recorded notebook output:
# array([0, 0, 0, ..., 0, 0, 0])
# Compare test set predictions with ground truth labels.
# Arguments are passed as (y_true, y_pred), matching the confusion_matrix
# call below; accuracy is symmetric, so the value itself is unaffected.
accuracy_score(y_test, y_pred)
# Recorded notebook output:
# 1.0

# The test set is heavily imbalanced (12753 vs. 190 rows in the matrix below),
# so the per-class breakdown is more informative than accuracy alone.
print(confusion_matrix(y_test, y_pred))
# Recorded notebook output:
# [[12753 0]
#  [ 0 190]]