In [1]:
import numpy as np
from scipy.stats import gmean, trim_mean
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, classification_report
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score
from itertools import combinations
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import learning_curve, ShuffleSplit

class stats:
    """
    Descriptive-statistics and exploration helper wrapped around a
    pandas DataFrame.
    """

    def __init__(self, data):
        # data: pandas DataFrame (numeric columns expected by the
        # statistics methods below).
        self.data = data
        self.min = self.data.min()
        self.max = self.data.max()
        self.range = self.data.max() - self.data.min()

    def info(self):
        """Return the dtype of every column."""
        return self.data.dtypes

    def variables(self, var):
        """
        Return the positional indices of columns of the requested kind.

        var: 0 -> object (categorical), 1 -> numeric (int64/float64),
             2 -> bool.
        """
        selected = []
        for i in range(self.data.shape[1]):
            # .iloc avoids label/position ambiguity when column labels
            # are not 0..n-1 integers.
            dtype = self.data.dtypes.iloc[i]
            if var == 0 and dtype == "object":
                selected.append(i)
            elif var == 1 and (dtype == "int64" or dtype == "float64"):
                selected.append(i)
            elif var == 2 and dtype == "bool":
                selected.append(i)
        return selected

    def desc_stats(self):
        """
        Descriptive statistics for every column of the data:
        Count, Distinct, Sum, Mean, Median, Mode, Min, 25%/50%/75%
        quantiles, Max, Range, Std, Var, S.E. Mean, GMean, Trim Mean,
        Skewness, Kurtosis. Returns a DataFrame indexed by column name.
        """
        results = np.ndarray(shape=(self.data.shape[1], 19), dtype=float, order='F')
        results[:, 0] = self.data.count()
        results[:, 1] = self.data.nunique()
        results[:, 2] = round(self.data.sum(), 2)
        results[:, 3] = self.data.mean()
        results[:, 4] = self.data.median()
        # mode() may return several rows on ties; average the modes.
        results[:, 5] = self.data.mode().mean()
        results[:, 6] = self.min
        results[:, 7] = self.data.quantile(0.25)
        results[:, 8] = self.data.quantile(0.50)
        results[:, 9] = self.data.quantile(0.75)
        results[:, 10] = self.max
        results[:, 11] = self.range
        results[:, 12] = self.data.std()
        results[:, 13] = self.data.var()
        results[:, 14] = self.data.sem()
        # gmean / trim_mean reduce column-wise (axis=0) on the frame.
        results[:, 15] = np.transpose(gmean(self.data))
        results[:, 16] = np.transpose(trim_mean(self.data, 0.1))
        results[:, 17] = self.data.skew()
        results[:, 18] = self.data.kurtosis()

        column_names = ['Count', 'Distinct', 'Sum', 'Mean', 'Median', 'Mode', 'Min',
                        '%25', '%50', '%75', 'Max', 'Range', 'Std', 'Var',
                        'S.E. Mean', 'Gmean', 'Trim Mean', 'Skewness', 'Kurtosis']
        # Named `summary` instead of `stats` to avoid shadowing the class name.
        summary = pd.DataFrame(results, columns=column_names, index=self.data.columns.values)
        return summary

    def interval(self, col, sınıf):
        """
        Equal-width frequency table for column `col` (positional index)
        split into `sınıf` classes.

        Bug fixes vs. the original: removed the bogus @property
        decorator (a property getter cannot take extra arguments, so
        the method could never be called), replaced the removed
        DataFrame.ix accessor with iloc, stepped the loop by the bin
        width, and accumulated every bin instead of keeping only the
        last one. Bin membership keeps the original strict bounds
        (i < value < i + width).
        """
        series = self.data.iloc[:, col]
        # Guard against a zero bin width for narrow ranges.
        width = max(1, round((series.max() - series.min()) / sınıf))
        rows = []
        i = int(series.min())
        stop = int(series.max())
        while i < stop:
            cf = self.data[(series > i) & (series < (i + width))].shape[0]
            rows.append([i, i + width, cf])
            i += width
        return pd.DataFrame(rows, columns=['start', 'end', 'frequency'])

    def frequency(self, col):
        """
        Value counts of column `col` (positional index), sorted by value.

        Bug fixes vs. the original: DataFrame.ix was removed from
        pandas (iloc is used instead), and the sort_values result was
        discarded instead of returned.
        """
        df = self.data.iloc[:, col].value_counts().to_frame().reset_index()
        # Sort by the value column (its label differs across pandas versions).
        return df.sort_values(by=df.columns[0])

    def descriptive_graph(stats, col):
        """
        Histogram of column `col` of `stats`.

        NOTE(review): the first parameter is named `stats` but is
        indexed like a DataFrame (stats[col], stats.columns), so this
        appears to be called unbound with a DataFrame, e.g.
        stats.descriptive_graph(df, col) — confirm against callers.
        The axis limits and annotation text are hard-coded.
        """
        plt.hist(stats[col])
        plt.ylabel('Probability')
        plt.title('Histogram of ' + stats.columns[col])
        plt.text(60, .025, r'$\mu=100,\ \sigma=15$')
        plt.axis([40, 160, 0, 0.03])
        plt.grid(True)
        plt.show()

def plot_confusion_matrix(y_true, y_pred,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of y_true vs y_pred.

    With normalize=True each row is divided by its sum so cells show
    rates instead of counts. Returns the matplotlib Axes of the plot.
    """
    if not title:
        title = ('Normalized confusion matrix' if normalize
                 else 'Confusion matrix, without normalization')

    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots(1, 1, figsize=(11, 6))
    image = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(image, ax=ax)
    # Show a tick for every class on both axes.
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Slant the x labels so long class names stay readable.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell, switching text color for dark backgrounds.
    value_format = '.2f' if normalize else 'd'
    threshold = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            ax.text(col, row, format(cm[row, col], value_format),
                    ha="center", va="center",
                    color="white" if cm[row, col] > threshold else "black")
    fig.tight_layout()
    return ax

def graph_roc_curve_multiple(y, log_fpr, log_tpr, knear_fpr, knear_tpr, svc_fpr, svc_tpr, tree_fpr, tree_tpr,
                             log_scores=None, knear_scores=None, svc_scores=None, tree_scores=None):
    """
    Plot the ROC curves of four classifiers on one figure, with each
    model's ROC-AUC shown in the legend.

    Bug fix / generalization: the AUC inputs were read from
    module-level globals (log_reg_pred, knears_pred, svc_pred,
    tree_pred), which raised NameError unless those exact names
    existed. They can now be passed explicitly via the *_scores
    keyword arguments; the globals remain the fallback for backward
    compatibility.
    """
    if log_scores is None:
        log_scores = log_reg_pred
    if knear_scores is None:
        knear_scores = knears_pred
    if svc_scores is None:
        svc_scores = svc_pred
    if tree_scores is None:
        tree_scores = tree_pred

    plt.figure(figsize=(16, 8))
    plt.title('ROC Curve \n Top 4 Classifiers', fontsize=18)
    plt.plot(log_fpr, log_tpr, label='Logistic Regression Classifier Score: {:.4f}'.format(roc_auc_score(y, log_scores)))
    plt.plot(knear_fpr, knear_tpr, label='KNears Neighbors Classifier Score: {:.4f}'.format(roc_auc_score(y, knear_scores)))
    plt.plot(svc_fpr, svc_tpr, label='Support Vector Classifier Score: {:.4f}'.format(roc_auc_score(y, svc_scores)))
    plt.plot(tree_fpr, tree_tpr, label='Decision Tree Classifier Score: {:.4f}'.format(roc_auc_score(y, tree_scores)))
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    plt.axis([-0.01, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.annotate('Minimum ROC Score of 50% \n (minimum score)', xy=(0.5, 0.5), xytext=(0.6, 0.3),
                arrowprops=dict(facecolor='#6E726D', shrink=0.05),
                )
    plt.legend()

def model_result(estimator, x, y, x_t, y_t, c):
    """
    Print cross-validation and hold-out metrics for a fitted estimator,
    then display its raw and normalized confusion matrices.

    x/y are the training data used for cross-validation scoring,
    x_t/y_t the hold-out set, and c the number of CV folds (or splitter).
    """
    scores = cross_val_score(estimator, x, y, cv=c)
    np.set_printoptions(precision=2)
    predictions = estimator.predict(x_t)

    divider = '---' * 15
    name = type(estimator).__name__
    print(divider)
    print(name)
    print(divider)
    print('CV Score: ', round(scores.mean() * 100, 2).astype(str) + '%')
    print(divider)
    for label, metric in (('Recall Score', recall_score),
                          ('Precision Score', precision_score),
                          ('F1 Score', f1_score),
                          ('Accuracy Score', accuracy_score)):
        print('{}: {:.2f}'.format(label, metric(y_t, predictions)))
    print(divider)

    plot_confusion_matrix(y_t, predictions,
                      title=name + ' Rakamlarla')
    plot_confusion_matrix(y_t, predictions, normalize=True,
                      title=name + " %'lik")
    plt.show()

def comb(full_data, comb_val):
    """Return all comb_val-length combinations of full_data as a list of tuples."""
    return [pair for pair in combinations(full_data, comb_val)]

def compare_corr(df):
    """
    Draw two heatmaps of df's correlation matrix: a cool/warm one with
    a Turkish title, then an annotated red/yellow/green one.
    """
    size = df.shape[1] + 1
    fig, ax = plt.subplots(1, 1, figsize=(size, size))
    corr = df.corr()
    sns.heatmap(corr, cmap='coolwarm_r', annot_kws={'size': df.shape[1]}, ax=ax)
    ax.set_title('Korrelasyon Matrisi', fontsize=14)
    plt.show()

    top_corr_features = corr.index
    plt.figure(figsize=(df.shape[1], df.shape[1]))
    sns.heatmap(df[top_corr_features].corr(), annot=True, cmap="RdYlGn")

def compare_distribution(df):
    """
    Overlay the per-class distributions of every feature column,
    assuming the last column of df is the binary class label
    (1 = fraud, 0 = non-fraud, as set where the frames are built).

    Bug fix: the legend labels were the placeholder strings
    'sado'/'mazo'; they now name the actual classes.
    """
    for j in range(0, df.shape[1] - 1):
        fig, ax = plt.subplots(figsize=(15, 5))
        positives = df[df.columns[j]][df.iloc[:, -1] == 1]
        negatives = df[df.columns[j]][df.iloc[:, -1] == 0]
        axis_label = str(df.columns[j]) + '. Feature'
        sns.distplot(positives, ax=ax, color="r", label="Fraud", axlabel=axis_label)
        sns.distplot(negatives, ax=ax, color="g", label="Non-Fraud", axlabel=axis_label)
        plt.show()

def impute_space(df):
    '''
    Coerce object-dtype columns (all but the last) to numeric,
    replacing unparseable entries such as blank strings with 0.
    Mutates df in place and also returns it.
    '''
    for col in range(df.shape[1] - 1):
        if df[col].dtypes == 'object':
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    return df

def dtype_print(df):
    '''
    Print the dtype of every column of df except the last
    (the label column), one per line with a separator.

    Bug fix: the original body referenced the module-level global
    `full` instead of its `df` parameter, so it ignored its argument
    and raised NameError when `full` did not exist.
    '''
    for i in range(df.shape[1] - 1):
        print(i, ' ', df[i].dtypes)
        print('*' * 20)

def main_graph(df, header, xlabel):
    '''
    Print column/row counts and the label distribution of df
    (label assumed to be the last column), then draw a countplot
    of the label with the given title and x-axis label.
    '''
    n_rows, n_cols = df.shape
    print('Sütun Sayısı: %s' % str(n_cols))
    print('Satır Sayısı: %s' % str(n_rows))
    print(df.iloc[:, -1].value_counts())

    sns.countplot(df.columns[-1], data=df, palette=["b", "r"])
    plt.title(header)
    plt.xlabel(xlabel)

def filter_zero_cols(df, threshold):
    '''
    Keep only the columns whose fraction of zero values is below
    `threshold`. The first column and the 'IS_FRAUD' label column are
    always kept.

    Bug fix: the original used value_counts()[0], which raises
    KeyError for any column containing no zeros at all; .get(0, 0)
    handles that case.
    '''
    flist = [0]
    for i in range(1, df.shape[1] - 1):
        zero = df.iloc[:, i].value_counts().get(0, 0)
        zero_rate = zero / df.shape[0]
        if zero_rate < threshold:
            flist.append(i)
    flist.append('IS_FRAUD')
    df = df.loc[:, flist]
    return df

def filter_zero_rows(df):
    '''
    Keep only the rows that have at least one non-zero value among
    the middle columns (everything except the first and last column).
    '''
    middle = df.iloc[:, 1:df.shape[1] - 1]
    has_nonzero = (middle != 0).any(axis=1)
    return df.loc[has_nonzero]

def undersample(df, rs):
    '''
    Random undersampling to a balanced dataset: shuffle df with seed
    rs, keep all rows whose last-column label is 1 plus an equal
    number of label-0 rows, then reshuffle with the same seed.
    '''
    shuffled = df.sample(frac=1, random_state=rs)
    is_positive = shuffled.iloc[:, -1] == 1
    positives = shuffled.loc[is_positive]
    negatives = shuffled.loc[shuffled.iloc[:, -1] == 0][:positives.shape[0]]
    balanced = pd.concat([positives, negatives])
    return balanced.sample(frac=1, random_state=rs)

def rb_scale(df):
    '''
    Robust-scale every column of df except the first and the last,
    one column at a time. Mutates df in place and also returns it.
    '''
    scaler = RobustScaler()
    for col in range(1, df.shape[1] - 1):
        column_values = df.iloc[:, col].values.reshape(-1, 1)
        df.iloc[:, col] = scaler.fit_transform(column_values)
    return df

def compare_boxplot(df):
    '''
    Draw a boxplot of every feature column of df grouped by the label
    (last) column.

    Bug fix: the x grouping originally referenced the notebook global
    `full_5` instead of the `df` parameter, so the function only
    worked by accident when that global happened to exist.
    '''
    for i in range(0, df.shape[1] - 1):
        sns.set(style="ticks", palette="pastel")
        sns.boxplot(x=df.columns[-1], y=df.columns[i], palette=["m", "g"], data=df)
        sns.despine(offset=10, trim=True)
        plt.show()

def feature_select_tree(df):
    '''
    Fit an ExtraTreesClassifier on df (features = all but the last
    column, target = last column), print the raw feature importances,
    and plot the ten largest as a horizontal bar chart.
    '''
    features = df.iloc[:, :-1]
    target = df.iloc[:, -1]
    model = ExtraTreesClassifier()
    model.fit(features, target)
    print(model.feature_importances_)
    importances = pd.Series(model.feature_importances_, index=features.columns)
    importances.nlargest(10).plot(kind='barh')
    plt.show()

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Plot training-score and cross-validation-score learning curves.

    Parameters
    ----------
    estimator : object implementing "fit" and "predict"
        Cloned for each validation split.
    title : str
        Chart title.
    X : array-like, shape (n_samples, n_features)
        Training vectors.
    y : array-like, shape (n_samples,) or (n_samples, n_features), optional
        Targets for X; None for unsupervised learning.
    ylim : tuple (ymin, ymax), optional
        Y-axis limits of the plot.
    cv : int or cross-validation generator, optional
        Number of folds, or a CV splitter object.
    n_jobs : int, optional
        Parallel jobs passed to learning_curve (default 1).
    train_sizes : array-like
        Relative sizes of the training subsets to evaluate.

    Returns
    -------
    The matplotlib.pyplot module with the figure drawn on it.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Per-size mean and std across folds, for the line and its band.
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.grid()
    # Bands first, then the mean curves on top (same layering as before).
    plt.fill_between(sizes, train_mean - train_std,
                     train_mean + train_std, alpha=0.1,
                     color="r")
    plt.fill_between(sizes, test_mean - test_std,
                     test_mean + test_std, alpha=0.1, color="g")
    plt.plot(sizes, train_mean, 'o-', color="r",
             label="Training score")
    plt.plot(sizes, test_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt


def learning(full_data, estimators, iter_number, t_size):
    """
    Plot a learning curve for each estimator on full_data, using the
    last column as the target and a seeded ShuffleSplit with
    iter_number splits of test fraction t_size.
    """
    X = full_data.iloc[:, :-1]
    y = full_data.iloc[:, -1]
    for est in estimators:
        splitter = ShuffleSplit(n_splits=iter_number, test_size=t_size, random_state=0)
        plot_learning_curve(est, type(est).__name__, X, y, cv=splitter, n_jobs=4)
    plt.show()

def compare_pairplot(df):
    """Seaborn pairplot of df, colored by its last (label) column."""
    sns.pairplot(df, hue=df.columns[-1], palette="husl")

def hyper_tune(X, Y):
    """
    Grid-search five classifier families on (X, Y) and return the best
    estimator of each, in order: logistic regression, KNN, SVC,
    decision tree, random forest.

    Bug fix: the logistic-regression parameter grid was accidentally
    written as a quoted string literal, so `log_reg_params` was never
    defined and the first GridSearchCV call raised NameError.
    """
    # Logistic Regression
    log_reg_params = {"penalty": ['l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    grid_log_reg = GridSearchCV(LogisticRegression(solver='lbfgs'), log_reg_params, n_jobs=-1)
    grid_log_reg.fit(X, Y)
    log_reg = grid_log_reg.best_estimator_

    # KNN
    knears_params = {"n_neighbors": list(range(2, 5, 1)),
                     'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}
    grid_knears = GridSearchCV(KNeighborsClassifier(), knears_params, n_jobs=-1)
    grid_knears.fit(X, Y)
    knears_neighbors = grid_knears.best_estimator_

    # Support Vector Classifier
    svc_params = {'C': [0.5, 0.7, 0.9, 1], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}
    grid_svc = GridSearchCV(SVC(gamma='scale'), svc_params, n_jobs=-1)
    grid_svc.fit(X, Y)
    svc_best = grid_svc.best_estimator_

    # DecisionTree Classifier
    tree_params = {"criterion": ["gini", "entropy"],
                   "max_depth": list(range(2, 4, 1)),
                   "min_samples_leaf": list(range(5, 7, 1))}
    grid_tree = GridSearchCV(DecisionTreeClassifier(), tree_params, n_jobs=-1)
    grid_tree.fit(X, Y)
    tree_clf = grid_tree.best_estimator_

    # RandomForest Classifier (largest grid — slow on big data)
    forest_params = {'n_estimators': [100, 300, 500, 800, 1200],
                     'max_depth': [5, 8, 15, 25, 30],
                     'min_samples_split': [2, 5, 10, 15, 100],
                     'min_samples_leaf': [1, 2, 5, 10]}
    grid_forest = GridSearchCV(RandomForestClassifier(random_state=1), forest_params, n_jobs=-1)
    grid_forest.fit(X, Y)
    forest_best = grid_forest.best_estimator_

    estimators = [log_reg, knears_neighbors, svc_best, tree_clf, forest_best]
    return estimators
C:\ProgramData\Anaconda3\envs\tensorflow_env\lib\site-packages\sklearn\externals\joblib\__init__.py:15: DeprecationWarning: sklearn.externals.joblib is deprecated in 0.21 and will be removed in 0.23. Please import this functionality directly from joblib, which can be installed with: pip install joblib. If this warning is raised when loading pickled models, you may need to re-serialize those models with scikit-learn 0.21+.
  warnings.warn(msg, category=DeprecationWarning)
Using TensorFlow backend.
In [2]:
# Read nonfraud CSV 
csv_list_nf = ['1.csv','2.csv','3.csv','4.csv','5.csv','6.csv','7.csv','8.csv','9.csv','10.csv','11.csv','12.csv','13.csv']
data_nf = pd.read_csv('0.csv', header=None, sep = '|')
for csv in csv_list_nf:
    data_nf = data_nf.append(pd.read_csv(csv, header=None, sep = '|'))

print('NonFraud veri formatı:')
print(data_nf.shape)

## Remove Duplicate Data
data_nf = data_nf.drop_duplicates(subset=0, keep='first')
print('Mükerrer kayıtların silinmesi sonrası format:')
print(data_nf.shape)

## Remove Extra Features
data_nf = data_nf.drop([42,87,129,174], axis=1)
print('Fazlalık featureların silinmesi sonrası format:')
print(data_nf.shape)

## Change column names
data_nf.columns = list(range(0,171))

## Add Label
data_nf['IS_FRAUD'] = 0
print('Label eklenmesi sonrası format:')
print(data_nf.shape)
NonFraud veri formatı:
(112314, 175)
Mükerrer kayıtların silinmesi sonrası format:
(99583, 175)
Fazlalık featureların silinmesi sonrası format:
(99583, 171)
Label eklenmesi sonrası format:
(99583, 172)
In [3]:
# Read fraud CSV
def frame(headers_csv, data_csv):
    """
    Read data_csv using the column names stored (as a single row) in
    headers_csv, and return the resulting DataFrame.

    Bug fix: the original passed the headers DataFrame to read_csv's
    `header` parameter, which expects row numbers, not names; the
    names belong in the `names` parameter instead.
    """
    headers = pd.read_csv(headers_csv, header=None, sep=',')
    dt = pd.read_csv(data_csv, header=None, sep=',', names=headers.iloc[0].tolist())
    return dt


csv_list_f = ['MOFeatures.csv','MTFeatures.csv','SMSFeatures.csv']
data_f0 = pd.read_csv('MOFeatures.csv', header=None, sep = ',')
data_f1 = pd.read_csv('MTFeatures.csv', header=None, sep = ',')
data_f2 = pd.read_csv('SMSFeatures.csv', header=None, sep = ',')

# Join the three fraud feature files on their first column (record key).
data_f = pd.merge(data_f0,data_f1,how='inner', on=0)
data_f = pd.merge(data_f,data_f2,how='inner', on=0)
# Strip the first character of the key and cast to int64
# (presumably a non-numeric prefix in the raw files — confirm with the
# data specification).
data_f[0] = data_f[0].astype(str).str[1:].astype(np.int64)

print('Fraud veri formatı:')
print(data_f.shape)

## Remove Duplicate Data
data_f = data_f.drop_duplicates(subset=0, keep='first')
print('Mükerrer kayıtların silinmesi sonrası format:')
print(data_f.shape)

## Order Features
# cols.csv lists the desired column order (quoted names); rebuild the
# fraud frame column by column so it lines up with the non-fraud frame.
data_f.columns = data_f.columns.astype(str)
cols = pd.read_csv('cols.csv', header=None, sep = ',')
a = cols.values.tolist()

full = pd.DataFrame()
full[0] = data_f['0'].values
for i in range(0,cols.shape[0]):
    # Strip the single quotes around each stored column name.
    full[i+1] = data_f[a[i][0].replace("'", "")]
full['IS_FRAUD'] = 1

print('Label eklenmesi sonrası format:')
print(full.shape)
C:\ProgramData\Anaconda3\envs\tensorflow_env\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (24,57,83,116) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Fraud veri formatı:
(11390, 171)
Mükerrer kayıtların silinmesi sonrası format:
(11390, 171)
Label eklenmesi sonrası format:
(11390, 172)
In [4]:
# Concatenate the fraud and non-fraud frames into one labelled dataset
# (NOTE(review): DataFrame.append is deprecated; pd.concat is the
# modern equivalent) and index the rows by the key in column 0.
full = full.append(data_nf)
full.index = full[0]
print(full.shape)
print(full)
(110973, 172)
                     0  1  2  3  4  5  6  7  8  9  ...  162  163  164  165  \
0                                                  ...
5050000181  5050000181  0  0  0  0  0  0  0  0  0  ...    0    0    0    0
5050000203  5050000203  0  0  0  0  0  0  0  0  0  ...    0    0    0    0
5050000265  5050000265  0  0  0  0  0  0  0  0  0  ...    0    0    0    0
5050000929  5050000929  0  0  0  0  0  0  0  0  0  ...    0    0    0    0
5050001299  5050001299  0  0  0  0  0  0  0  0  0  ...    0    0    0    0
...                ... .. .. .. .. .. .. .. .. ..  ...  ...  ...  ...  ...
5558143164  5558143164  0  0  0  0  0  0  0  0  0  ...    0    0    0    0
5558315811  5558315811  0  0  0  0  0  0  0  0  0  ...    0    0    0    0
5558931816  5558931816  0  0  0  0  0  0  0  0  0  ...    0    0    0    0
5558948655  5558948655  0  0  0  0  0  0  1  2  2  ...    1    0    0    0
5558988407  5558988407  0  0  0  0  0  0  0  0  0  ...    0    0    0    0

             166   167   168  169  170  IS_FRAUD
0
5050000181     0     0     0    0    0         1
5050000203    24    91    61    0    0         1
5050000265     0     0     0    0    0         1
5050000929     0     0     0    0    0         1
5050001299     0     0     0    0    0         1
...          ...   ...   ...  ...  ...       ...
5558143164     6   458   179    0    0         0
5558315811     0     0     0    0    0         0
5558931816   283   601  1111    0    0         0
5558948655  1523  1617   752    0    0         0
5558988407   159  1764  1082    0    0         0

[110973 rows x 172 columns]
In [5]:
# Coerce object-typed feature columns to numeric (blanks -> 0), then
# print every column's dtype for inspection.
# NOTE(review): dtype_print as defined above reads the global `full`,
# not its argument, so passing full_1 here has no effect until fixed.
full_1 = impute_space(full)
dtype_print(full_1)
0   int64
********************
1   int64
********************
2   int64
********************
3   int64
********************
4   int64
********************
5   int64
********************
6   int64
********************
7   int64
********************
8   int64
********************
9   int64
********************
10   int64
********************
11   int64
********************
12   int64
********************
13   int64
********************
14   int64
********************
15   int64
********************
16   int64
********************
17   int64
********************
18   int64
********************
19   int64
********************
20   int64
********************
21   int64
********************
22   int64
********************
23   int64
********************
24   int64
********************
25   int64
********************
26   int64
********************
27   int64
********************
28   int64
********************
29   int64
********************
30   float64
********************
31   int64
********************
32   int64
********************
33   int64
********************
34   int64
********************
35   int64
********************
36   int64
********************
37   int64
********************
38   int64
********************
39   int64
********************
40   int64
********************
41   int64
********************
42   int64
********************
43   int64
********************
44   int64
********************
45   int64
********************
46   int64
********************
47   int64
********************
48   int64
********************
49   int64
********************
50   int64
********************
51   int64
********************
52   int64
********************
53   int64
********************
54   int64
********************
55   int64
********************
56   int64
********************
57   int64
********************
58   int64
********************
59   int64
********************
60   int64
********************
61   int64
********************
62   int64
********************
63   int64
********************
64   int64
********************
65   int64
********************
66   int64
********************
67   int64
********************
68   int64
********************
69   int64
********************
70   int64
********************
71   int64
********************
72   int64
********************
73   int64
********************
74   int64
********************
75   int64
********************
76   int64
********************
77   int64
********************
78   int64
********************
79   int64
********************
80   int64
********************
81   int64
********************
82   int64
********************
83   int64
********************
84   int64
********************
85   int64
********************
86   int64
********************
87   int64
********************
88   int64
********************
89   int64
********************
90   int64
********************
91   int64
********************
92   int64
********************
93   int64
********************
94   int64
********************
95   int64
********************
96   int64
********************
97   int64
********************
98   int64
********************
99   int64
********************
100   int64
********************
101   int64
********************
102   int64
********************
103   int64
********************
104   int64
********************
105   int64
********************
106   int64
********************
107   int64
********************
108   int64
********************
109   int64
********************
110   int64
********************
111   int64
********************
112   int64
********************
113   int64
********************
114   int64
********************
115   float64
********************
116   int64
********************
117   int64
********************
118   int64
********************
119   int64
********************
120   int64
********************
121   int64
********************
122   int64
********************
123   int64
********************
124   int64
********************
125   int64
********************
126   int64
********************
127   int64
********************
128   int64
********************
129   int64
********************
130   int64
********************
131   int64
********************
132   int64
********************
133   int64
********************
134   int64
********************
135   int64
********************
136   int64
********************
137   int64
********************
138   int64
********************
139   int64
********************
140   int64
********************
141   int64
********************
142   int64
********************
143   int64
********************
144   int64
********************
145   int64
********************
146   int64
********************
147   int64
********************
148   int64
********************
149   int64
********************
150   int64
********************
151   int64
********************
152   int64
********************
153   int64
********************
154   int64
********************
155   int64
********************
156   int64
********************
157   int64
********************
158   int64
********************
159   int64
********************
160   int64
********************
161   int64
********************
162   int64
********************
163   int64
********************
164   int64
********************
165   int64
********************
166   int64
********************
167   int64
********************
168   int64
********************
169   int64
********************
170   int64
********************
In [6]:
# Class balance before resampling (output below: ~99.6k non-fraud vs ~11.4k fraud).
main_graph(full_1,'Fraud Dağılımı',"Non-Fraud            &&&            Fraud")
Sütun Sayısı: 172
Satır Sayısı: 110973
0    99583
1    11390
Name: IS_FRAUD, dtype: int64
In [7]:
# Drop feature columns that are zero in 95% or more of the rows.
full_2 = filter_zero_cols(full_1,0.95)
print(full_2)
                     0  13  14  15  16  17  18  25  26  27  ...  138  139  \
0                                                           ...
5050000181  5050000181   0   0   0   0   0   0   0   0   0  ...    0    0
5050000203  5050000203   0   0   0   0   0   0   0   0   0  ...    0    2
5050000265  5050000265   0   0   0   0   0   0   0   0   0  ...    0    0
5050000929  5050000929   0   0   0   0   0   0   0   0   0  ...    0    0
5050001299  5050001299   0   0   0   0   0   0   0   0   0  ...    0    0
...                ...  ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ...  ...
5558143164  5558143164   5   7  25   0   0   0   5   7  25  ...   41    0
5558315811  5558315811   2   3   1   0   0   0   2   3   1  ...    1    0
5558931816  5558931816  20  39  60   0   0   0  20  39  60  ...  165    0
5558948655  5558948655  10  20  92   0   0   0  11  22  94  ...  116    0
5558988407  5558988407  27  49  65   0   0   0  29  52  68  ...  105    0

            140  142  143  144   166   167   168  IS_FRAUD
0
5050000181    0    0    0    0     0     0     0         1
5050000203    7    0    0    0    24    91    61         1
5050000265    0    0    0    0     0     0     0         1
5050000929    0    0    0    0     0     0     0         1
5050001299    0    0    0    0     0     0     0         1
...         ...  ...  ...  ...   ...   ...   ...       ...
5558143164    0    4    9    9     6   458   179         0
5558315811    0    3    3    1     0     0     0         0
5558931816    0   17   34   45   283   601  1111         0
5558948655    0   14   31   96  1523  1617   752         0
5558988407    0   33  108  173   159  1764  1082         0

[110973 rows x 80 columns]
In [8]:
# Drop rows whose feature columns are all zero.
full_3 = filter_zero_rows(full_2)
print(full_3)
                     0  13  14  15  16  17  18  25  26  27  ...  138  139  \
0                                                           ...
5050000181  5050000181   0   0   0   0   0   0   0   0   0  ...    0    0
5050000203  5050000203   0   0   0   0   0   0   0   0   0  ...    0    2
5050000265  5050000265   0   0   0   0   0   0   0   0   0  ...    0    0
5050000929  5050000929   0   0   0   0   0   0   0   0   0  ...    0    0
5050001299  5050001299   0   0   0   0   0   0   0   0   0  ...    0    0
...                ...  ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ...  ...
5558143164  5558143164   5   7  25   0   0   0   5   7  25  ...   41    0
5558315811  5558315811   2   3   1   0   0   0   2   3   1  ...    1    0
5558931816  5558931816  20  39  60   0   0   0  20  39  60  ...  165    0
5558948655  5558948655  10  20  92   0   0   0  11  22  94  ...  116    0
5558988407  5558988407  27  49  65   0   0   0  29  52  68  ...  105    0

            140  142  143  144   166   167   168  IS_FRAUD
0
5050000181    0    0    0    0     0     0     0         1
5050000203    7    0    0    0    24    91    61         1
5050000265    0    0    0    0     0     0     0         1
5050000929    0    0    0    0     0     0     0         1
5050001299    0    0    0    0     0     0     0         1
...         ...  ...  ...  ...   ...   ...   ...       ...
5558143164    0    4    9    9     6   458   179         0
5558315811    0    3    3    1     0     0     0         0
5558931816    0   17   34   45   283   601  1111         0
5558948655    0   14   31   96  1523  1617   752         0
5558988407    0   33  108  173   159  1764  1082         0

[110357 rows x 80 columns]
In [9]:
# Balance the classes by random undersampling (seed 15).
full_4 = undersample(full_3,15)
# NOTE(review): this binds a second name to the same frame, not a copy;
# use full_4.copy() if an unscaled snapshot is intended.
full_41= full_4
print(full_4)
                     0  13  14   15  16  17  18  25  26   27  ...  138  139  \
0                                                             ...
5519642725  5519642725  67  82  471   2   4   1  67  82  471  ...  187    0
5551089384  5551089384  28  36  189   3   3   0  28  36  189  ...  105    2
5077374179  5077374179   0   0    0   0   0   0   0   0    0  ...    0    1
5055496631  5055496631   4   6    5   0   0   0   4   6    5  ...    9    0
5338188555  5338188555  19  21   96   2   2   0  19  21   96  ...   14    1
...                ...  ..  ..  ...  ..  ..  ..  ..  ..  ...  ...  ...  ...
5053681411  5053681411  13  27   54   0   0   0  13  27   54  ...   68    0
5551710191  5551710191  26  37  229   1   3   1  26  37  229  ...   69    1
5551424763  5551424763   1   1    5   0   0   0   1   1    5  ...    7    3
5530988358  5530988358  14  32   58   0   0   0  14  32   58  ...   40    0
5538413098  5538413098   1   1    6   1   1   5   1   1    6  ...    0    0

            140  142  143  144   166   167   168  IS_FRAUD
0
5519642725    0   35   43  204     0     0     0         1
5551089384    2   10   13   73     0     0     0         1
5077374179    1    0    0    0     0     0     0         1
5055496631    0    3    5    3     0     0     0         0
5338188555    1   12   14   72     0     0     0         1
...         ...  ...  ...  ...   ...   ...   ...       ...
5053681411    0   13   51   89   727   822  1234         0
5551710191    3   13   20  159     0     0     0         1
5551424763    3    3    3    8  1727  1796   595         1
5530988358    0   14   22   35   818   399   169         0
5538413098    0    1    1    6     0     0     0         1

[21636 rows x 80 columns]
In [10]:
# Class balance after undersampling (output below: 10,818 rows per label).
main_graph(full_4,'Fraud Dağılımı',"Non-Fraud            &&&            Fraud")
Sütun Sayısı: 80
Satır Sayısı: 21636
1    10818
0    10818
Name: IS_FRAUD, dtype: int64
In [11]:
# NOTE(review): this top-level rb_scaler is unused — rb_scale()
# constructs its own RobustScaler internally.
rb_scaler = RobustScaler()
# Robust-scale the feature columns. rb_scale mutates its argument, so
# full_4 (and full_41) are scaled in place as well.
full_5 = rb_scale(full_4)
print(full_5)
                     0        13        14        15   16   17   18        25  \
0
5519642725  5519642725  4.692308  2.730769  8.388889  2.0  4.0  1.0  4.000000
5551089384  5551089384  1.692308  0.961538  3.166667  3.0  3.0  0.0  1.400000
5077374179  5077374179 -0.461538 -0.423077 -0.333333  0.0  0.0  0.0 -0.466667
5055496631  5055496631 -0.153846 -0.192308 -0.240741  0.0  0.0  0.0 -0.200000
5338188555  5338188555  1.000000  0.384615  1.444444  2.0  2.0  0.0  0.800000
...                ...       ...       ...       ...  ...  ...  ...       ...
5053681411  5053681411  0.538462  0.615385  0.666667  0.0  0.0  0.0  0.400000
5551710191  5551710191  1.538462  1.000000  3.907407  1.0  3.0  1.0  1.266667
5551424763  5551424763 -0.384615 -0.384615 -0.240741  0.0  0.0  0.0 -0.400000
5530988358  5530988358  0.615385  0.807692  0.740741  0.0  0.0  0.0  0.466667
5538413098  5538413098 -0.384615 -0.384615 -0.222222  1.0  1.0  5.0 -0.400000

                  26        27  ...       138  139  140     142       143  \
0                               ...
5519642725  2.379310  7.015625  ...  2.267606  0.0  0.0  1.7500  0.805556
5551089384  0.793103  2.609375  ...  1.112676  2.0  2.0  0.1875 -0.027778
5077374179 -0.448276 -0.343750  ... -0.366197  1.0  1.0 -0.4375 -0.388889
5055496631 -0.241379 -0.265625  ... -0.239437  0.0  0.0 -0.2500 -0.250000
5338188555  0.275862  1.156250  ... -0.169014  1.0  1.0  0.3125  0.000000
...              ...       ...  ...       ...  ...  ...     ...       ...
5053681411  0.482759  0.500000  ...  0.591549  0.0  0.0  0.3750  1.027778
5551710191  0.827586  3.234375  ...  0.605634  1.0  3.0  0.3750  0.166667
5551424763 -0.413793 -0.265625  ... -0.267606  3.0  3.0 -0.2500 -0.305556
5530988358  0.655172  0.562500  ...  0.197183  0.0  0.0  0.4375  0.222222
5538413098 -0.413793 -0.250000  ... -0.366197  0.0  0.0 -0.3750 -0.361111

                 144        166       167       168  IS_FRAUD
0
5519642725  2.549296   0.000000  0.000000  0.000000         1
5551089384  0.704225   0.000000  0.000000  0.000000         1
5077374179 -0.323944   0.000000  0.000000  0.000000         1
5055496631 -0.281690   0.000000  0.000000  0.000000         0
5338188555  0.690141   0.000000  0.000000  0.000000         1
...              ...        ...       ...       ...       ...
5053681411  0.929577   9.565789  2.069226  4.140940         0
5551710191  1.915493   0.000000  0.000000  0.000000         1
5551424763 -0.211268  22.723684  4.521082  1.996644         1
5530988358  0.169014  10.763158  1.004405  0.567114         0
5538413098 -0.239437   0.000000  0.000000  0.000000         1

[21636 rows x 80 columns]
In [12]:
# Boxplots of each scaled feature, grouped by the fraud label.
compare_boxplot(full_5)