我的逻辑回归模型有多好?[关闭]

How good is my logistic regression model? [closed]

提问人:Bayan 提问时间:11/17/2023 更新时间:11/17/2023 访问量:29

问:


想改进这个问题吗?请编辑这篇帖子来更新问题,使其可以用事实和引文来回答。

2天前关闭。

我正在开发一个简单的逻辑回归 ML 模型,代码已经写完了,但我希望有人能给我一些反馈。代码总体上好吗?我该如何改进它?非常感谢你抽出时间!

我尝试遵循典型逻辑回归模型中的所有必要步骤,包括数据预处理以及将数据拆分为训练集/测试集。此外,我还考虑了一些评估指标,包括混淆矩阵和 ROC 曲线。

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification

# Load the churn CSV from the hard-coded Colab Drive path and preview it
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/WA_Fn-UseC_-Telco-Customer-Churn.csv')
data.head()

# Column dtypes and non-null counts, used to spot non-numeric columns
data.info()

# Number of nulls per column
data.isnull().sum()

# Heatmap of the null mask (one cell per row/column pair)
import seaborn as sns
null_mask = data.isna()
sns.heatmap(null_mask)

# Churn counts, split by gender
sns.countplot(data=data, x='Churn', hue='gender')

# Data preprocessing
# 'TotalCharges' is parsed as numeric; errors='coerce' turns any value that
# cannot be parsed into NaN, which is imputed further down.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

# Columns that get integer codes from LabelEncoder (the target 'Churn' included)
categorical_cols = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod', 'Churn',
]

# One fitted encoder per column is kept so the codes can be mapped back later
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    label_encoders[col] = encoder

# 'customerID' is an identifier, not a predictive feature
data = data.drop(columns='customerID')

# Replace the NaNs in the numeric columns listed here with the column mean
numeric_cols = ['TotalCharges']  # Add other numeric columns if needed
imputer = SimpleImputer(strategy='mean')
data[numeric_cols] = imputer.fit_transform(data[numeric_cols])

# Re-inspect dtypes after preprocessing
data.info()

from sklearn.preprocessing import StandardScaler

# Target is 'Churn'; every remaining column is a feature
y = data['Churn']
X = data.drop(columns='Churn')

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features: statistics are learned from the training split
# only and then applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
data.head()

# Build the logistic regression model and fit it on the training split
# (fit() returns the estimator itself, so the calls can be chained)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard class predictions for the test split
y_pred = model.predict(X_test)

# Fraction of test rows predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# Probability of the positive class (column 1 of predict_proba) per test row
positive_class = 1
y_pred_prob = model.predict_proba(X_test)[:, positive_class]

y_pred_prob

# Compute ROC curve and calculate AUC.
# roc_curve and auc are not among the names imported at the top of the
# script, so they are imported here; without this the next line raises
# NameError.
from sklearn.metrics import auc, roc_curve

fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve together with the diagonal of a random classifier
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='ROC curve (AUC = {:.3f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], 'k--', label='Random classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Logistic Regression')
plt.legend(loc='lower right')
plt.show()

from sklearn.metrics import precision_recall_curve

# Precision and recall at every candidate decision threshold
# (this rebinds `thresholds`)
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)

# Both curves against the threshold; the last element of precision/recall is
# dropped so their length matches `thresholds`
for curve, line_style, curve_label in (
    (precision, "g-", "Precision"),
    (recall, "r-", "Recall"),
):
    plt.plot(thresholds, curve[:-1], line_style, label=curve_label)
plt.xlabel('Threshold')
plt.ylabel('Precision/Recall')
plt.legend()
plt.show()

# Decision threshold applied to the positive-class probability
threshold = 0.45

# A row is labelled 1 when its probability is strictly above the threshold,
# otherwise 0; this replaces the earlier y_pred
y_pred = [int(prob > threshold) for prob in y_pred_prob]

print(list(y_pred))

# Example new data point: one value per feature, in the column order of X.
# NOTE(review): the values are assumed to already use the same integer codes
# the label encoders produced during preprocessing -- confirm.
new_data = [0, 0, 1, 0, 15, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 2, 40.0, 600.0]

# Wrap the point in a one-row DataFrame carrying the training feature names,
# so the scaler sees the same columns it was fitted on (this also fails fast
# if the number of values does not match the number of features).
new_df = pd.DataFrame([new_data], columns=X.columns)

# The model was trained on standardized features, so the new point must go
# through the same fitted scaler before it is scored.
new_data_scaled = scaler.transform(new_df)

# Coefficients of the (binary) logistic regression model
coefficients = model.coef_[0]

# Log-odds = w . x + b, evaluated on the scaled features and including the
# intercept, so it matches what the model itself computes.
log_odds = np.dot(coefficients, new_data_scaled[0]) + model.intercept_[0]

# Odds = exp(log-odds)
odds = np.exp(log_odds)

# Predict with the trained model on the scaled point
new_prediction = model.predict(new_data_scaled)

# Display the prediction
print("Predicted Churn:", new_prediction)

# Display the log-odds and odds
print("Log Odds:", log_odds)
print("Odds:", odds)

# Display the coefficients next to the feature names of the training frame
print("Coefficients:")
for feature, coefficient in zip(X.columns, coefficients):
    print(f"{feature}: {coefficient}")


def logit2prob(model, X):
    """Return the positive-class probability for every row of X.

    Args:
        model: fitted estimator exposing ``coef_`` and ``intercept_``.
        X: 2-D array of features, one row per sample.

    Returns:
        Array of shape ``(n_samples, coef_.shape[0])`` holding
        ``sigmoid(X @ coef_.T + intercept_)``.
    """
    log_odds = np.dot(X, model.coef_.T) + model.intercept_
    # exp(-|z|) cannot overflow, so extreme log-odds give 0.0 / 1.0
    # instead of the inf / inf = NaN that exp(z) / (1 + exp(z)) produces.
    decay = np.exp(-np.abs(log_odds))
    return np.where(log_odds >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))


# Get probabilities for the test set
probabilities = logit2prob(model, X_test)

# Display the probabilities
print("Probabilities for the first few samples:")
print(probabilities[:5])

# Helper that turns the model's linear scores (log-odds) into probabilities
def logit2prob(model, X):
    """Return the positive-class probability for every row of X.

    Args:
        model: fitted estimator exposing ``coef_`` and ``intercept_``.
        X: 2-D array of features, one row per sample.

    Returns:
        Array of shape ``(n_samples, coef_.shape[0])`` holding
        ``sigmoid(X @ coef_.T + intercept_)``.
    """
    log_odds = np.dot(X, model.coef_.T) + model.intercept_
    # exp(-|z|) cannot overflow, so extreme log-odds give 0.0 / 1.0
    # instead of the inf / inf = NaN that exp(z) / (1 + exp(z)) produces.
    decay = np.exp(-np.abs(log_odds))
    probability = np.where(
        log_odds >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay)
    )
    return probability

# Positive-class probabilities for the test split, via the manual helper
probabilities = logit2prob(model, X_test)

# Rows whose probability reaches the threshold (>=) become 1, all others 0
predictions = np.where(probabilities >= threshold, 1, 0)

# Show the first few thresholded predictions
print("Adjusted Predictions for the first few samples:")
print(predictions[:5])

# Confusion matrix of the true labels against the current y_pred
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated seaborn heatmap on a fresh figure
plt.figure(figsize=(8, 6))
ax = sns.heatmap(cm, annot=True, fmt=".0f", linewidths=.5, square=True, cmap='Blues_r')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('Actual Label')
ax.set_title('Confusion Matrix')
plt.show()

# Accuracy and the per-class precision/recall/F1 report for the same y_pred
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}", "Classification Report:", report, sep="\n")

# Evaluate the model's performance with cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# X holds the raw (unscaled) features, while the hold-out evaluation above
# used standardized ones. Putting the scaler and the classifier in one
# pipeline makes every fold fit the scaler on its own training part only, so
# the scores describe the same preprocessing + model combination and nothing
# leaks from the validation part. cross_val_score works on clones of the
# estimator, so the already fitted `model` is left untouched.
cv_pipeline = make_pipeline(StandardScaler(), model)

scores = cross_val_score(cv_pipeline, X, y, cv=5)
print("Cross-validation scores: ", scores)
print("Mean cross-validation score: ", scores.mean())
Python 机器学习 人工智能 逻辑回归

评论


答: 暂无答案