AttributeError: 'numpy.ndarray' object has no attribute 'columns' -> when I didn't even change dataframe into np.ndarry

提问人:PRADDYUMN YADAV 提问时间:11/11/2023


目前,我正在尝试破解 Kaggle 的宇宙飞船泰坦尼克号数据集,当我这样做时,我遇到了一个错误(如下)。在下面的代码中,是从中移除转换要素后的相同数据集。X

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

class Cleaner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.drop(["Name", "PassengerId", "FoodCourt", "ShoppingMall", "Cabin"], axis=1)
        return X

class ObjectEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        columns_to_encode = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
        encoder = OneHotEncoder()
        # Encode each attribute and add the columns to the DataFrame
        for column in columns_to_encode:
            matrix = encoder.fit_transform(X[[column]]).toarray()
            column_names = [f"{column}_{i}" for i in range(matrix.shape[1])]
            for i in range(len(matrix.T)):
                X[column_names[i]] = matrix.T[i]

        # Remove Old Parameters
        X = X.drop(["HomePlanet", "CryoSleep", "Destination", "VIP"], axis=1)

        return X

class LabelYEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)
        return y_encoded

class NullImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        features_with_missing_values = X.columns[X.isnull().any()].tolist()
        imputer = SimpleImputer(strategy='median')
        X[features_with_missing_values] = imputer.fit_transform(X[features_with_missing_values])
        return X

pipeline = Pipeline([
    ("cleaner", Cleaner()),
    ("object_encoder", ObjectEncoder()),
    ("label_y_encoder", LabelYEncoder()),
    ("null_imputer", NullImputer()),
X = pipeline.fit_transform(strat_train_set)


来自 sklearn-template 文档

请注意,转换仅将 X 作为输入,并应返回 X 的转换版本


class Cleaner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.drop(["Name", "PassengerId", "FoodCourt", "ShoppingMall", "Cabin"], axis=1)
        return X

class ObjectEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        columns_to_encode = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
        encoder = OneHotEncoder()
        # Encode each attribute and add the columns to the DataFrame
        for column in columns_to_encode:
            matrix = encoder.fit_transform(X[[column]]).toarray()
            column_names = [f"{column}_{i}" for i in range(matrix.shape[1])]
            for i in range(len(matrix.T)):
                X[column_names[i]] = matrix.T[i]

        # Remove old parameters
        X = X.drop(["HomePlanet", "CryoSleep", "Destination", "VIP"], axis=1)
        return X

class LabelYEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)
        return y_encoded

class NullImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        features_with_missing_values = X.columns[X.isnull().any()].tolist()
        imputer = SimpleImputer(strategy="median")
        X[features_with_missing_values] = imputer.fit_transform(X[features_with_missing_values])
        return X

pipeline = Pipeline(
        ("cleaner", Cleaner()),
        ("object_encoder", ObjectEncoder()),
        ("label_y_encoder", LabelYEncoder()),
        ("null_imputer", NullImputer()),

df = pd.read_csv("train.csv")

# "In the code below X is the same dataset after removing the Transformed Feature from it."
# Idk what "Transformed Feature" is, but I assume you mean "Transported"
strat_train_set = df.drop(columns="Transported")

X = pipeline.fit_transform(strat_train_set)
您可以通过编辑 :method: 来接受参数来解决这个问题。但就我个人而言,我会修改您的自定义类以使其更通用,并使用 a 来控制每个步骤的输入。如下所示:LabelYEncoder.transformyColumnTransformer

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, LabelEncoder

# Separating `X` and `y`
df = pd.read_csv("train.csv")
X = df.drop(columns="Transported")
y = df.Transported

cols_to_drop = ["Name", "PassengerId", "FoodCourt", "ShoppingMall", "Cabin"]
cols_to_one_hot = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
# Remove the `cols_to_drop` and `cols_to_one_hot`.
# The remainder will be passed through via the identity function.
cols_to_keep = X.drop(columns=[*cols_to_drop, *cols_to_one_hot]).columns

prep_trf = ColumnTransformer(
        # One hot encode the `cols_to_one_hot`
        ("one_hot", OneHotEncoder(sparse_output=False), cols_to_one_hot),
        # Impute the `cols_to_keep`
        ("imputer", SimpleImputer(strategy="median"), cols_to_keep),
    # Drop the remaining columns not mentioned in 
    # `cols_to_one_hot` or `cols_to_keep`
    # Transform output to `pandas.DataFrame` (optional)

# Apply the transformations to `X`.
X = prep_trf.fit_transform(X)

调用将处理输入要素的所有预处理。因为只接受输入,不支持 :method:,所以你不能把它添加到对象中。事实上,从技术上讲,如果没有一些自定义代码,就无法将其添加到管道中。相反,您应该独立转换变量,或者完全忽略它(因为它已经是具有布尔值的二进制形式)。prep_trf.fit_transformXLabelEncoderyset_outputprep_trfy

scikit-learn1.1.3 (< 1.2.X)

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, LabelEncoder

# Separating `X` and `y`
df = pd.read_csv("train.csv")
X = df.drop(columns="Transported")
y = df.Transported

cols_to_drop = ["Name", "PassengerId", "FoodCourt", "ShoppingMall", "Cabin"]
cols_to_one_hot = ["HomePlanet", "CryoSleep", "Destination", "VIP"]
# Remove the `cols_to_drop` and `cols_to_one_hot`.
# The remainder will be passed through via the identity function.
cols_to_keep = X.drop(columns=[*cols_to_drop, *cols_to_one_hot]).columns

prep_trf = ColumnTransformer(
        # One hot encode the `cols_to_one_hot`
        # NOTE: had to change the `sparse_output` parameter to `sparse`.
        ("one_hot", OneHotEncoder(sparse=False), cols_to_one_hot),
        # Impute the `cols_to_keep`
        ("imputer", SimpleImputer(strategy="median"), cols_to_keep),
    # Drop the remaining columns not mentioned in 
    # `cols_to_one_hot` or `cols_to_keep`
    # NOTE: :method:`set_output` does not exist in scikit-learn version < 1.2.X

# Apply the transformations to `X`.
X_ = prep_trf.fit_transform(X)

因为 1.1.3 版本没有 :method:,我们需要重新创建自己:scikit-learnset_outputpandas.DataFrame

X_v1_1_3 = pd.DataFrame(
    # Track the index just incase it was modified earlier.
    # This can affect `joins`.
    # Use the :method:`prep_trf_get_feature_names_out` to get the 
    # correct column names.

在这两种情况下,输出将是相同的。在两个版本中分别应用 to。LabelEncodery
