Classification with huge testing

Asked by: Phil Crom  Asked: 11/15/2023  Updated: 11/15/2023  Views: 6

Q:

I am currently trying to solve a classification problem that is later supposed to run in a simulation environment. That means the classifier receives data from multiple events and has to assign each data point to one class or the other in real time. The simulation runs for 1 day of simulation time, and the algorithm is then evaluated on that basis.

For the training part of the algorithm I use different portions of the day, because a single day contains 64 million data points. I only used 35 million (representative?), since otherwise training would take far too long and the data would become unmanageable. The problem is that when the 35 million data points are split into a training set and a test set, the trained classifier looks very good (AUC up to 0.9), but it performs very poorly when used in the full simulation environment: the false-positive rate is 10 times the true-positive rate.
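For reference, a minimal sketch of how the two rates compared above are typically computed from a confusion matrix; the label/prediction arrays below are made-up stand-ins, not data from the actual simulation:

import numpy as np
from sklearn.metrics import confusion_matrix

# Stand-ins for the true labels and the classifier's real-time decisions
# collected over one simulated day.
y_true = np.array([0, 0, 1, 1, 0, 1])
y_pred = np.array([1, 0, 1, 0, 1, 1])

# confusion_matrix returns [[tn, fp], [fn, tp]] for binary labels.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print("True-positive rate: ", tp / (tp + fn))
print("False-positive rate:", fp / (fp + tn))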

At the moment I am trying to fit an SVM, an MLP, and an LSTM. This is my code:

DataLoader:
Just loads the data, without any preprocessing or the like.
Data shape when the LSTM is used:
X = (35000000,20,24)         -> All dense
y = (35000000,)

Data shape otherwise:
X = (35000000,24)            -> All dense
y = (35000000,)
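The DataLoader itself is not shown above. As a point of reference, here is a minimal sketch of how the flat (35000000, 24) array could be windowed into the (35000000, 20, 24) layout used for the LSTM, assuming each sample is a window of 20 consecutive time steps; the helper name and the use of sliding_window_view are my assumptions, not the original loader:

import numpy as np

def make_lstm_windows(X_flat, window=20):
    # Hypothetical helper: sliding_window_view returns shape
    # (n - window + 1, n_features, window), so the last two axes are
    # swapped to get (n - window + 1, window, n_features). The result
    # is a view, so the 35M-row array is not copied here.
    w = np.lib.stride_tricks.sliding_window_view(X_flat, window, axis=0)
    return w.transpose(0, 2, 1)

X_flat = np.random.rand(1000, 24)    # stand-in for the loaded data
X_lstm = make_lstm_windows(X_flat)   # shape (981, 20, 24)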
NetworkLibrary:

from sklearn.svm import SVC
from sklearn.svm import OneClassSVM
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import *
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.optimizers import Adam
from keras.losses import BinaryCrossentropy
from keras.metrics import BinaryAccuracy

# from keras.wrappers.scikit_learn import KerasClassifier
import numpy as np
import sys
import random

np.set_printoptions(threshold=sys.maxsize)
np.set_printoptions(suppress=True)


class NetworkLibrary:
    def __init__(self, seed) -> None:
        random.seed(seed)
        np.random.seed(seed)

    def get_trained_SVM(self, X_train, y_train, gamma, C):
        print("Starting machine learning SVM with")
        print("gamma: " + str(gamma))
        print("C: " + str(C))
        clf = SVC(gamma=gamma, C=C, verbose=1, probability=True)
        clf.fit(X_train, y_train)
        return clf

    def get_trained_MLP(
        self, X_train, y_train, hidden_layer, max_iter_no_change, max_iter
    ):
        print("Starting machine learning MLP with")
        print("Hidden layer: " + str(hidden_layer))
        print(
            "Max iterations without a loss improvement of 0.001: "
            + str(max_iter_no_change)
        )
        print("Max number of training iterations: " + str(max_iter))
        clf = MLPClassifier(
            hidden_layer_sizes=hidden_layer,
            n_iter_no_change=max_iter_no_change,
            max_iter=max_iter,
            verbose=True,
            solver="adam",
            random_state=1,
        )
        clf.fit(X_train, y_train)
        return clf

    def get_trained_LSTM(self, X_train, y_train, lstm_sizes, epochs):
        print("Starting machine learning LSTM with")
        print("LSTM layers: " + str(lstm_sizes))
        print("Number of epochs: " + str(epochs))
        clf = Sequential()
        n_layers = len(lstm_sizes)
        # Keras LSTMs expect input of shape (time_steps, n_features);
        # with the (35000000, 20, 24) data this is (20, 24).
        (n_samples, time_steps, n_features) = np.shape(X_train)
        print(np.shape(y_train))
        for i in range(n_layers):
            cur_size = lstm_sizes[i]
            if i == 0 and n_layers == 1:
                # A single requested size still stacks two LSTM layers of that
                # size: the first returns sequences for the second to consume.
                clf.add(
                    LSTM(
                        cur_size,
                        return_sequences=True,
                        input_shape=(time_steps, n_features),
                    )
                )
                clf.add(LSTM(cur_size, return_sequences=False))
                break
            elif i == 0:
                clf.add(
                    LSTM(
                        cur_size,
                        return_sequences=True,
                        input_shape=(time_steps, n_features),
                    )
                )
            elif i == (n_layers - 1):
                clf.add(LSTM(cur_size, return_sequences=False))
            else:
                clf.add(LSTM(cur_size, return_sequences=True))
        clf.add(Dense(1))
        clf.compile(
            # Dense(1) has no activation, so the model outputs logits and
            # from_logits=True matches that. (Maybe from_logits is not such
            # a good choice here.)
            loss=BinaryCrossentropy(from_logits=True),
            optimizer=Adam(),
            metrics=[BinaryAccuracy()],
        )
        clf.fit(X_train, y_train, epochs=epochs)
        return clf
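Note that because the final Dense(1) layer has no activation and the loss is built with from_logits=True, predict() on this model returns raw logits rather than probabilities. A minimal sketch of converting them at inference time; the sigmoid is the standard counterpart to from_logits=True, not code from the post:

import numpy as np

def lstm_predict_proba(model, X):
    # The Sequential model above outputs one logit per sample; a sigmoid
    # maps it to a probability in [0, 1] for thresholding or AUC.
    logits = model.predict(X)
    return 1.0 / (1.0 + np.exp(-logits))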
MainClass:

from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

from NetworkTester import NetworkTester
from NetworkLibrary import NetworkLibrary
from DataLoader import *

import numpy as np
import joblib
import sys
import random
import math


random.seed(10)
np.random.seed(10)

np.set_printoptions(threshold=sys.maxsize)
np.set_printoptions(suppress=True)

# train_len = 1 / 10
train_len = 2 / 3
print_classes_and_coefs = False
max_grad = 4

if __name__ == "__main__":
    model = sys.argv[1]
    partition = sys.argv[2]
    # e.g. partition = "0.01" -> negative_grad = 2, ml_grad = 4 - 2 = 2
    negative_grad = -1 * math.log10(float(partition))
    ml_grad = max_grad - int(negative_grad)
    ml_grad_with_start = ml_grad + 2
    savePath = "/home/a/adamczykp/tmp/f2md-training/saveFile"
    library = NetworkLibrary(10)
    print("Starting...")
    if model.isdigit():
        model = int(model)
    else:
        model = model_name_to_number(model_name=model)
    X, y = collect_data_portion(model=model, check_version=1, partition=partition)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_len, random_state=2, stratify=y
    )
    # NOTE: these class weights are computed but never passed to any model below.
    class_weight = np.flip(compute_class_weight("balanced", classes=[0, 1], y=y))
    if model == 0:
        clf = library.get_trained_SVM(X_train, y_train, 0.001, 100.0)
        joblib.dump(clf, savePath + "/clfs/clf_SVM_SINGLE_" + partition + ".pkl")
    elif model == 1:
        hidden_layer = tuple([128 for _ in range(ml_grad)])
        n_iter_no_change = 500 * ml_grad
        max_iter = 15000 * ml_grad
        clf = library.get_trained_MLP(
            X_train, y_train, hidden_layer, n_iter_no_change, max_iter
        )
        joblib.dump(clf, savePath + "/clfs/clf_MLP_SINGLE_L1N25_" + partition + ".pkl")
    elif model == 2:
        hidden_layer = tuple([128 for _ in range(ml_grad_with_start)])
        n_iter_no_change = 500 * ml_grad_with_start
        max_iter = 15000 * ml_grad_with_start
        clf = library.get_trained_MLP(
            X_train, y_train, hidden_layer, n_iter_no_change, max_iter
        )
        joblib.dump(clf, savePath + "/clfs/clf_MLP_SINGLE_L3N25_" + partition + ".pkl")
    elif model == 3:
        hidden_layer = [128 for _ in range(ml_grad)]
        epochs = 500 * ml_grad
        clf = library.get_trained_LSTM(X_train, y_train, hidden_layer, epochs)
        joblib.dump(
            clf, savePath + "/clfs/clf_LSTM_RECURRENT_TIN_" + partition + ".pkl"
        )
    elif model == 4:
        hidden_layer = [128 for _ in range(ml_grad_with_start)]
        epochs = 500 * ml_grad_with_start
        clf = library.get_trained_LSTM(X_train, y_train, hidden_layer, epochs)
        joblib.dump(
            clf, savePath + "/clfs/clf_LSTM_RECURRENT_BIG_" + partition + ".pkl"
        )
    else:
        raise ValueError("Not a suitable model selected")
    if print_classes_and_coefs:  # loss_ and coefs_ exist only on the MLP models
        print("Classes: " + str(clf.classes_) + "\n")
        print("Loss: " + str(round(clf.loss_, 4)) + "\n")
        for i in range(len(clf.coefs_)):
            print("----------Coef " + str(i) + ":----------\n")
            coef = clf.coefs_[i]
            print(str(coef) + "\n")
    ### Evaluate the model below here ###
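    # Sketch only (not from the original post): the question reports AUC and
    # the false-/true-positive rates, so one plausible evaluation for the
    # sklearn models (the SVC was created with probability=True) would be:
    from sklearn.metrics import roc_auc_score, confusion_matrix

    y_pred = clf.predict(X_test)
    y_score = clf.predict_proba(X_test)[:, 1]
    print("AUC: " + str(roc_auc_score(y_test, y_score)))
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print("TPR: " + str(tp / (tp + fn)))
    print("FPR: " + str(fp / (fp + tn)))
    # Note: the Keras LSTM returns logits from predict(), so it would need a
    # sigmoid first (see the lstm_predict_proba sketch above).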
bigdata classification simulation

Comments


A: No answers yet