Asked by: Phil Crom | Asked: 11/15/2023 | Updated: 11/15/2023 | Views: 6
Classification with huge testing
Q:
I am currently trying to solve a classification problem that should later run inside a simulation environment. The classifier receives data from multiple events and has to assign each sample to one class or the other in real time. The simulation runs for 1 day of simulated time, and the algorithm is then evaluated on that basis.

For training I use different parts of one day, because a full day contains 64 million data points. I only use 3.5 million of them (representative?), since otherwise training would take far too long and the data could not be transferred. The problem: when I split those 3.5 million points into a training set and a test set, the trained classifier looks very good (AUC up to 0.9), but it performs very poorly in the full simulation environment. The false positive rate is 10 times the true positive rate.
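One thing I already suspect: with a purely random split, samples that are only moments apart (and nearly identical) land in both the training and the test set, so the test AUC may be optimistic compared to the simulation. A chronological split would be a cheap sanity check; a minimal sketch (the variable names are placeholders, not my real pipeline):

import numpy as np
from sklearn.model_selection import train_test_split

def chronological_split(X, y, train_size=2 / 3):
    # Split along simulation time instead of randomly, so the test set
    # contains only samples from strictly after the training period.
    cut = int(len(X) * train_size)
    return X[:cut], X[cut:], y[:cut], y[cut:]

# current random split (can leak temporally adjacent samples into both sets):
# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=2 / 3, stratify=y)
# chronological split for comparison:
# X_train, X_test, y_train, y_test = chronological_split(X, y)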
Currently I am trying to fit an SVM, an MLP, and an LSTM. Here is my code:
DataLoader:
Just loads the data, without any preprocessing or anything like that.
Data shape when LSTM is used:
X = (3500000, 20, 24) -> all dense
y = (3500000,)
Data shape otherwise:
X = (3500000, 24) -> all dense
y = (3500000,)
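The loader itself is essentially just this (a placeholder sketch; the file names and np.load usage stand in for my actual storage format):

import numpy as np

def collect_data_portion(model, check_version, partition):
    # Placeholder: one array per partition, memory-mapped so the full
    # array does not have to be copied into RAM at load time.
    X = np.load("data_" + str(partition) + ".npy", mmap_mode="r")
    y = np.load("labels_" + str(partition) + ".npy", mmap_mode="r")
    return X, y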
NetworkLibrary:
from sklearn.svm import SVC
from sklearn.svm import OneClassSVM
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import *
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.optimizers import Adam
from keras.losses import BinaryCrossentropy
from keras.metrics import BinaryAccuracy
# from keras.wrappers.scikit_learn import KerasClassifier
import numpy as np
import sys
import random
np.set_printoptions(threshold=sys.maxsize)
np.set_printoptions(suppress=True)
class NetworkLibrary:
    def __init__(self, seed) -> None:
        random.seed(seed)
        np.random.seed(seed)

    def get_trained_SVM(self, X_train, y_train, gamma, C):
        print("Starting machine learning SVM with")
        print("gamma: " + str(gamma))
        print("C: " + str(C))
        clf = SVC(gamma=gamma, C=C, verbose=1, probability=True)
        clf.fit(X_train, y_train)
        return clf
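    # Side note (an assumption, not something I measured): SVC training
    # scales roughly quadratically to cubically with the number of samples,
    # so fitting it on millions of rows is usually impractical; a linear
    # surrogate like sklearn's SGDClassifier(loss="hinge") could be trained
    # on the full data as a cheap baseline instead.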
    def get_trained_MLP(
        self, X_train, y_train, hidden_layer, max_iter_no_change, max_iter
    ):
        print("Starting machine learning MLP with")
        print("Hidden layer: " + str(hidden_layer))
        print(
            "Maximal iterations without change in loss of 0.001: "
            + str(max_iter_no_change)
        )
        print("Maximal iterations of network training runs: " + str(max_iter))
        clf = MLPClassifier(
            hidden_layer_sizes=hidden_layer,
            n_iter_no_change=max_iter_no_change,
            max_iter=max_iter,
            verbose=True,
            solver="adam",
            random_state=1,
        )
        clf.fit(X_train, y_train)
        return clf
    def get_trained_LSTM(self, X_train, y_train, lstm_sizes, epochs):
        print("Starting machine learning LSTM with")
        print("LSTM layers: " + str(lstm_sizes))
        print("Number of epochs: " + str(epochs))
        clf = Sequential()
        n_layers = len(lstm_sizes)
        # Keras expects input_shape=(time_steps, n_features); with X_train
        # shaped (samples, 20, 24) the middle axis is treated as time.
        (n_samples, time_steps, n_features) = np.shape(X_train)
        print(np.shape(y_train))
        for i in range(n_layers):
            cur_size = lstm_sizes[i]
            if i == 0 and n_layers == 1:
                # A single requested size stacks two LSTM layers of that size.
                clf.add(
                    LSTM(
                        cur_size,
                        return_sequences=True,
                        input_shape=(time_steps, n_features),
                    )
                )
                clf.add(LSTM(cur_size, return_sequences=False))
                break
            elif i == 0:
                clf.add(
                    LSTM(
                        cur_size,
                        return_sequences=True,
                        input_shape=(time_steps, n_features),
                    )
                )
            elif i == (n_layers - 1):
                clf.add(LSTM(cur_size, return_sequences=False))
            else:
                clf.add(LSTM(cur_size, return_sequences=True))
        clf.add(Dense(1))
        clf.compile(
            # Maybe from_logits is not such a good choice here
            loss=BinaryCrossentropy(from_logits=True),
            optimizer=Adam(),
            metrics=[BinaryAccuracy()],
        )
        clf.fit(X_train, y_train, epochs=epochs)
        return clf
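Regarding the from_logits comment above: with a plain Dense(1) output and BinaryCrossentropy(from_logits=True), the loss itself is fine, but BinaryAccuracy's default threshold of 0.5 is then applied to raw logits, so the reported accuracy is misleading. A sketch of the two consistent wirings (build_head is a hypothetical helper, not part of my code):

from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.losses import BinaryCrossentropy
from keras.metrics import BinaryAccuracy
from keras.optimizers import Adam

def build_head(time_steps=20, n_features=24, logits=True):
    model = Sequential()
    model.add(LSTM(128, input_shape=(time_steps, n_features)))
    if logits:
        model.add(Dense(1))  # raw logits
        model.compile(
            loss=BinaryCrossentropy(from_logits=True),
            optimizer=Adam(),
            metrics=[BinaryAccuracy(threshold=0.0)],  # logits split at 0.0
        )
    else:
        model.add(Dense(1, activation="sigmoid"))  # probabilities
        model.compile(
            loss=BinaryCrossentropy(from_logits=False),
            optimizer=Adam(),
            metrics=[BinaryAccuracy()],  # default threshold 0.5 now matches
        )
    return model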
MainClass:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from NetworkTester import NetworkTester
from NetworkLibrary import NetworkLibrary
from DataLoader import *
import numpy as np
import joblib
import sys
import random
import math
random.seed(10)
np.random.seed(10)
np.set_printoptions(threshold=sys.maxsize)
np.set_printoptions(suppress=True)
# train_len = 1 / 10
train_len = 2 / 3
print_classes_and_coefs = False
max_grad = 4
if __name__ == "__main__":
    model = sys.argv[1]
    partition = sys.argv[2]
    negative_grad = -1 * math.log10(float(partition))
    ml_grad = max_grad - int(negative_grad)
    ml_grad_with_start = ml_grad + 2
    savePath = "/home/a/adamczykp/tmp/f2md-training/saveFile"
    library = NetworkLibrary(10)
    print("Starting...")
    if model.isdigit():
        model = int(model)
    else:
        model = model_name_to_number(model_name=model)
    X, y = collect_data_portion(model=model, check_version=1, partition=partition)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_len, random_state=2, stratify=y
    )
    class_weight = np.flip(compute_class_weight("balanced", classes=[0, 1], y=y))
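    # Note: class_weight is computed here but never passed to any model.
    # SVC accepts class_weight= directly, and Keras' fit() takes a dict like
    # {0: class_weight[0], 1: class_weight[1]}; MLPClassifier has no
    # class_weight parameter at all. The np.flip also swaps the weights
    # between the two classes, which may or may not be intended.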
    if model == 0:
        clf = library.get_trained_SVM(X_train, y_train, 0.001, 100.0)
        joblib.dump(clf, savePath + "/clfs/clf_SVM_SINGLE_" + partition + ".pkl")
    elif model == 1:
        hidden_layer = tuple([128 for _ in range(ml_grad)])
        n_iter_no_change = 500 * ml_grad
        max_iter = 15000 * ml_grad
        clf = library.get_trained_MLP(
            X_train, y_train, hidden_layer, n_iter_no_change, max_iter
        )
        joblib.dump(clf, savePath + "/clfs/clf_MLP_SINGLE_L1N25_" + partition + ".pkl")
    elif model == 2:
        hidden_layer = tuple([128 for _ in range(ml_grad_with_start)])
        n_iter_no_change = 500 * ml_grad_with_start
        max_iter = 15000 * ml_grad_with_start
        clf = library.get_trained_MLP(
            X_train, y_train, hidden_layer, n_iter_no_change, max_iter
        )
        joblib.dump(clf, savePath + "/clfs/clf_MLP_SINGLE_L3N25_" + partition + ".pkl")
    elif model == 3:
        hidden_layer = [128 for _ in range(ml_grad)]
        epochs = 500 * ml_grad
        clf = library.get_trained_LSTM(X_train, y_train, hidden_layer, epochs)
        # (Keras models are normally saved with clf.save(); joblib may not
        # serialize them reliably)
        joblib.dump(
            clf, savePath + "/clfs/clf_LSTM_RECURRENT_TIN_" + partition + ".pkl"
        )
    elif model == 4:
        hidden_layer = [128 for _ in range(ml_grad_with_start)]
        epochs = 500 * ml_grad_with_start
        clf = library.get_trained_LSTM(X_train, y_train, hidden_layer, epochs)
        # fixed: the partition was missing from this file name
        joblib.dump(
            clf, savePath + "/clfs/clf_LSTM_RECURRENT_BIG_" + partition + ".pkl"
        )
    else:
        raise ValueError("Not a suitable model selected")
    if print_classes_and_coefs:
        # (only meaningful for the MLPClassifier models)
        print("Classes: " + str(clf.classes_) + "\n")
        print("Loss: " + str(round(clf.loss_, 4)) + "\n")
        for i in range(len(clf.coefs_)):
            print("----------Coef " + str(i) + ":----------\n")
            coef = clf.coefs_[i]
            print(str(coef) + "\n")
    ### Evaluate the model below here ###
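With a positive class this rare, ROC-AUC can look strong while false positives still dwarf true positives, so the evaluation step should also report the confusion matrix and precision/recall. A minimal sketch of what could go below that marker (written for the sklearn models; the Keras model returns scores from predict() instead of hard labels):

from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_auc_score

y_pred = clf.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("TP: %d  FP: %d  TN: %d  FN: %d" % (tp, fp, tn, fn))
print("Precision: %.4f" % precision_score(y_test, y_pred))
print("Recall:    %.4f" % recall_score(y_test, y_pred))
# ROC-AUC needs scores, not labels (the SVC was built with probability=True)
print("ROC-AUC: %.4f" % roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))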
A: No answers yet