提问人:Bayan 提问时间:11/17/2023 更新时间:11/17/2023 访问量:29
我的逻辑回归模型有多好?[关闭]
How good is my logistic regression model? [closed]
问:
我正在开发一个简单的逻辑回归 ML 模型,代码已经写完,但我希望有人能给我一些反馈。代码总体上好吗?我该如何改进它?非常感谢你抽出时间!
我尝试遵循典型逻辑回归模型中的所有必要步骤,包括数据预处理和将数据拆分为训练/测试。此外,我还考虑了一些评估指标,包括混淆矩阵和 ROC 曲线。
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
# Load the customer-churn CSV (the path looks like a mounted Google Drive folder in Colab).
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/WA_Fn-UseC_-Telco-Customer-Churn.csv')
# Explicit print() so the preview is shown even when this is not the last
# expression of a notebook cell, or when the file runs as a plain script.
print(data.head())
# Check for the non-numeric column
data.info()
# Check for null
print(data.isnull().sum())
# Visualize null values
import seaborn as sns
# Give each plot its own figure; otherwise the count plot is drawn on top of
# the heatmap's axes when both run in the same cell/script.
plt.figure()
sns.heatmap(data.isna())
plt.show()
# Distribution of Male vs Female and their churn
plt.figure()
sns.countplot(x='Churn', data=data, hue='gender')
plt.show()
# Data preprocessing
# Convert 'TotalCharges' to numeric; errors='coerce' turns entries that cannot
# be parsed into NaN, which are imputed further down.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
# Encode categorical variables using Label Encoding
# NOTE(review): LabelEncoder assigns arbitrary integer codes. For nominal
# columns with more than two categories a linear model will read those codes
# as an ordering; one-hot encoding would avoid that, but it changes the number
# of feature columns that the hand-written example row later in the script
# relies on, so the encoding is left as is here.
categorical_cols = ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
                    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                    'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']
# Keep every fitted encoder so the integer codes can be mapped back to labels.
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    label_encoders[col] = encoder
# Drop the 'customerID' column as it's not a relevant feature for prediction
data = data.drop('customerID', axis=1)
# Handle missing values in numeric columns like 'TotalCharges' with mean imputation
# NOTE(review): the imputer is fitted on the full data set, i.e. before the
# train/test split, so the test rows contribute to the imputed mean.
numeric_cols = ['TotalCharges']  # Add other numeric columns if needed
imputer = SimpleImputer(strategy='mean')
data[numeric_cols] = imputer.fit_transform(data[numeric_cols])
# Check for the non-numeric column after preprocessing
data.info()
# Separate the target column from the predictor columns
y = data['Churn']
X = data.drop(columns='Churn')
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn.preprocessing import StandardScaler
# Learn the scaling parameters from the training rows only, then apply them to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
data.head()
# Build the logistic regression classifier and fit it on the training split
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# Hard class predictions for the held-out rows
y_pred = model.predict(X_test)
# Share of held-out rows that were classified correctly
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
# Per-class probabilities for the held-out rows; column 1 is kept as the positive class
class_probs = model.predict_proba(X_test)
y_pred_prob = class_probs[:, 1]
y_pred_prob
# Compute ROC curve and calculate AUC
# roc_curve and auc are not among the imports at the top of this script, so
# they are imported here; without this the next lines raise NameError.
from sklearn.metrics import roc_curve, auc
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)
# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.3f})')
# Diagonal reference line: the expected curve of a random classifier
plt.plot([0, 1], [0, 1], 'k--', label='Random classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Logistic Regression')
plt.legend(loc='lower right')
plt.show()
from sklearn.metrics import precision_recall_curve
# Precision and recall for each candidate decision threshold
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)
# Plot both curves against the threshold; the last point of each curve is
# dropped so its length matches `thresholds`
for scores, line_style, legend_name in (
    (precision, "g-", "Precision"),
    (recall, "r-", "Recall"),
):
    plt.plot(thresholds, scores[:-1], line_style, label=legend_name)
plt.xlabel('Threshold')
plt.ylabel('Precision/Recall')
plt.legend()
plt.show()
# Decision cut-off applied to the positive-class probability
threshold = 0.45
# A row is labelled 1 only when its probability is strictly above the cut-off, otherwise 0
y_pred = (y_pred_prob > threshold).astype(int).tolist()
print(y_pred[:])
# Example new data point (values are assumed to follow the column order of X)
new_data = [0, 0, 1, 0, 15, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 2, 40.0, 600.0]
# Convert the new data point to a DataFrame, labelling the columns like the training features
new_df = pd.DataFrame([new_data], columns=X.columns)
# The model was fitted on standardized features, so the new row has to go
# through the same fitted scaler; predicting on raw values gives a meaningless result.
new_data_scaled = scaler.transform(new_df)
# Access the coefficients of the logistic regression model
coefficients = model.coef_[0]
# Calculate the log-odds: w . x + b on the scaled row, including the intercept
log_odds = np.dot(new_data_scaled[0], coefficients) + model.intercept_[0]
# Calculate the odds
odds = np.exp(log_odds)
# Make predictions using the trained model
new_prediction = model.predict(new_data_scaled)
# Display the prediction
print("Predicted Churn:", new_prediction)
# Display the log-odds and odds
print("Log Odds:", log_odds)
print("Odds:", odds)
# Display the coefficients next to the feature names they belong to
print("Coefficients:")
for feature, coefficient in zip(new_df.columns, coefficients):
    print(f"{feature}: {coefficient}")
def logit2prob(model, X):
    """Turn a fitted linear classifier's log-odds into probabilities.

    Args:
        model: Object exposing ``coef_`` and ``intercept_`` (for example a
            fitted binary ``LogisticRegression``).
        X: Array-like of samples, preprocessed the same way as the data the
            model was fitted on.

    Returns:
        Array with the shape of ``np.dot(X, model.coef_.T)`` holding the
        sigmoid of the log-odds for every sample.
    """
    log_odds = np.dot(X, model.coef_.T) + model.intercept_
    # sigmoid(z) = exp(-log(1 + exp(-z))). logaddexp evaluates the inner log
    # without overflow, whereas exp(z) / (1 + exp(z)) becomes inf / inf = nan
    # once z is large.
    return np.exp(-np.logaddexp(0.0, -log_odds))
# Run the manual sigmoid helper over every held-out row
probabilities = logit2prob(model, X_test)
# Preview the first five rows: header on one line, values on the next
print("Probabilities for the first few samples:", probabilities[:5], sep="\n")
# Use the logit2prob function to get probabilities for the test set
# (this re-defines the helper that already appears earlier in the script)
def logit2prob(model, X):
    """Turn a fitted linear classifier's log-odds into probabilities.

    Args:
        model: Object exposing ``coef_`` and ``intercept_`` (for example a
            fitted binary ``LogisticRegression``).
        X: Array-like of samples, preprocessed the same way as the data the
            model was fitted on.

    Returns:
        Array with the shape of ``np.dot(X, model.coef_.T)`` holding the
        sigmoid of the log-odds for every sample.
    """
    log_odds = np.dot(X, model.coef_.T) + model.intercept_
    # sigmoid(z) = exp(-log(1 + exp(-z))). logaddexp evaluates the inner log
    # without overflow, whereas exp(z) / (1 + exp(z)) becomes inf / inf = nan
    # once z is large.
    return np.exp(-np.logaddexp(0.0, -log_odds))
# Recompute the probabilities for the held-out rows
probabilities = logit2prob(model, X_test)
# Rows whose probability is at or above the cut-off become class 1, all others class 0
predictions = np.greater_equal(probabilities, threshold).astype(int)
# Preview the first five adjusted labels: header on one line, values on the next
print("Adjusted Predictions for the first few samples:", predictions[:5], sep="\n")
# Build the confusion matrix from the current y_pred
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# Draw the matrix as an annotated heatmap with seaborn
heatmap_options = dict(annot=True, fmt=".0f", linewidths=.5, square=True, cmap='Blues_r')
plt.figure(figsize=(8, 6))
sns.heatmap(cm, **heatmap_options)
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')
plt.title('Confusion Matrix')
plt.show()
# Summary metrics for the same predictions
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:", report, sep="\n")
# Evaluate the model's performance with Cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# X is the unscaled feature frame, while `model` above was trained on
# standardized features. Bundling the scaler with a fresh classifier makes each
# fold fit the scaler on its own training part, so the scores reflect the same
# preprocessing as the evaluated model and no validation-fold statistics leak in.
cv_model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
scores = cross_val_score(cv_model, X, y, cv=5)
print("Cross-validation scores: ", scores)
print("Mean cross-validation score: ", scores.mean())
答: 暂无答案
上一个:使用英特尔映像分类进行验证提取
下一个:递归样本拆分方案(带网格搜索)
评论