import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    PowerTransformer,
    StandardScaler,
)
# Load the raw table; every later step in this script reads from this frame.
df = pd.read_csv('data.csv')

# Columns that must never reach the model as features: the label and the row
# identifier. They are excluded from BOTH lists regardless of dtype. The label
# is dropped from the feature frame later in this script, so a string-typed
# 'target' left in the categorical list would make the column selection fail
# at fit time, and a string-typed 'id' would be one-hot encoded as a feature.
non_feature_columns = {'target', 'id'}

# Numeric candidates: 64-bit integer and float columns only.
numeric_features = [
    column
    for column in df.select_dtypes(include=['int64', 'float64']).columns
    if column not in non_feature_columns
]

# Categorical candidates: object and pandas 'category' columns.
categorical_features = [
    column
    for column in df.select_dtypes(include=['object', 'category']).columns
    if column not in non_feature_columns
]
# Numeric branch: fill gaps with the column median, apply a Yeo-Johnson power
# transform, then standard-scale.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('power', PowerTransformer(method='yeo-johnson')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: missing values become the literal category 'missing',
# then the column is one-hot encoded into a dense array. Categories unseen at
# fit time are ignored at transform time; min_frequency=10 sets the count
# below which a category is treated as infrequent.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    (
        'onehot',
        OneHotEncoder(
            handle_unknown='ignore',
            sparse_output=False,
            min_frequency=10,
        ),
    ),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column list to its branch; any column in neither list is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)
# Forest used only to rank features for the selection step; seeded so the
# ranking is reproducible.
selector_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Keep the features whose importance clears the threshold ('median' here; the
# grid search later in this script also tries 'mean').
feature_selector = SelectFromModel(selector_forest, threshold='median')

# Final estimator: logistic regression with class-balanced weights.
final_classifier = LogisticRegression(max_iter=1000, class_weight='balanced')

# End-to-end model: preprocess -> select features -> classify. The step names
# are referenced by the hyper-parameter grid, so they must stay as they are.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('feature_selection', feature_selector),
        ('classifier', final_classifier),
    ]
)
# Feature frame: everything except the label column.
X = df.drop(columns=['target'])

# Encode the labels as integers 0..n_classes-1. The fitted encoder is kept
# (instead of being thrown away) so integer predictions can be mapped back to
# the original labels with label_encoder.inverse_transform(...), and the class
# names remain available as label_encoder.classes_.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['target'])

# Hold out 20% of the rows for the final evaluation. The split is stratified
# on the encoded label and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
# Ten regularisation strengths, log-spaced from 1e-3 to 1e2.
c_values = np.logspace(-3, 2, num=10)

# Search space, keyed by '<pipeline step>__<parameter>'.
param_grid = {
    'classifier__C': c_values,
    'classifier__penalty': ['l2'],
    'feature_selection__threshold': ['median', 'mean'],
}

# Five shuffled, stratified folds with a fixed seed so runs are comparable.
cv_splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over the grid, scored by one-vs-rest ROC-AUC, using all
# available cores. refit=True retrains the best configuration on the full
# training split once the search is done.
gs = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc_ovr',
    cv=cv_splitter,
    n_jobs=-1,
    refit=True,
    verbose=1,
)

gs.fit(X_train, y_train)
# Summary of the search: winning configuration and its mean CV score.
print(f'Best params: {gs.best_params_}')
print(f'Best CV score: {gs.best_score_:.4f}')

# Score the held-out split with the refit best estimator.
y_pred = gs.predict(X_test)
y_proba = gs.predict_proba(X_test)

print(classification_report(y_test, y_pred))

# roc_auc_score expects a 1-D score vector for a binary target (the
# probability of the class with the greater label); passing the full
# (n_samples, 2) matrix raises ValueError. Only the multiclass case takes the
# whole probability matrix together with multi_class='ovr'.
if y_proba.shape[1] == 2:
    test_auc = roc_auc_score(y_test, y_proba[:, 1])
else:
    test_auc = roc_auc_score(y_test, y_proba, multi_class='ovr')
print(f'ROC-AUC: {test_auc:.4f}')
# Persist the refit best pipeline (preprocessing + selection + classifier).
joblib.dump(gs.best_estimator_, 'model_pipeline.pkl')

# Reload it the way a consumer would.
# NOTE(security): joblib.load unpickles the file and can execute arbitrary
# code, so only ever load model files from a trusted source.
loaded_pipe = joblib.load('model_pipeline.pkl')

# This script never defines `new_data`, so the original call raised NameError.
# Use the held-out features as a stand-in so the reloaded pipeline is
# exercised end to end; replace with a real DataFrame that has the same
# columns as the training features when scoring new records.
new_data = X_test
predictions = loaded_pipe.predict(new_data)