Consider TPOT your data science assistant. TPOT is a Python machine learning tool that optimizes machine learning pipelines using genetic programming.
Considérez TPOT comme votre assistant en science des données. TPOT est un outil d'apprentissage automatique Python qui optimise les pipelines d'apprentissage automatique à l'aide de la programmation génétique.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow
import urllib.request
# Load the Titanic-style survival dataset from Google Drive (Colab mount paths).
df_test = pd.read_csv('/content/drive/MyDrive/test.csv')
# PassengerId becomes the index so it is never treated as a feature.
df = pd.read_csv('/content/drive/MyDrive/train.csv', index_col='PassengerId')
# Separate the target from the features
X = df.drop('Survived', axis='columns')
Y = df['Survived']
# NOTE(review): df_test is loaded but never used in this visible chunk.
# Exploratory Data Analysis (EDA)
# Column groups feeding the ColumnTransformer built below; each group gets
# its own preprocessing branch.
drop = ['Ticket']  # dropped entirely
passthrough = ['Pclass', 'SibSp', 'Parch']  # numeric, forwarded unchanged
num_manquantes = ['Age', 'Fare']  # numeric with missing values -> median impute
cat_manquantes = ['Embarked']  # categorical with missing values -> mode impute + one-hot
cat = ['Sex']  # categorical, complete -> one-hot
text = ['Name', 'Cabin']  # free-text-ish columns, each handled separately below
# Import the preprocessing tools
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import FunctionTransformer
# Import the algorithms we want to try
from sklearn.linear_model import RidgeClassifier # RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import RandomForestClassifier as RandomForest # RandomForestClassifier is the model worth learning to use
# Create a pipeline
from sklearn.pipeline import Pipeline
# Pipeline de Preprocessing
# Preprocessing for categorical columns with missing values ('Embarked'):
# fill gaps with the most frequent category, then one-hot encode.
cat_manquantes_preprocessing = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    # `sparse` was deprecated in scikit-learn 1.2 and removed in 1.4;
    # `sparse_output=False` is the equivalent dense-output spelling.
    OneHotEncoder(sparse_output=False, handle_unknown='ignore')
)
def extraire_la_premiere_lettre(serie):
    """Return a one-column DataFrame holding the first character of each value.

    Takes a pandas Series (e.g. the 'Cabin' column); the result is wrapped in
    a DataFrame for compatibility with the ColumnTransformer.
    """
    premiere_lettre = serie.str[0]
    return premiere_lettre.to_frame()
# Preprocessing for 'Cabin': keep only the deck letter, mark missing cabins
# with a sentinel category, then one-hot encode.
extraction_lettre = FunctionTransformer(extraire_la_premiere_lettre)
imputation_cabine = SimpleImputer(strategy='constant', fill_value='MANQUANTE')
encodage_cabine = OneHotEncoder(handle_unknown='ignore')
preprocess_cabin = make_pipeline(extraction_lettre, imputation_cabine, encodage_cabine)
# Full preprocessing: route each column group to its dedicated transformer.
preprocessing = make_column_transformer(
    # `sparse` was deprecated in scikit-learn 1.2 and removed in 1.4;
    # `sparse_output=False` is the equivalent dense-output spelling.
    (OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat),
    (cat_manquantes_preprocessing, cat_manquantes),
    (SimpleImputer(strategy='median'), num_manquantes),
    # CountVectorizer expects 1-D input, hence the bare column name (no list).
    (CountVectorizer(), 'Name'),
    (preprocess_cabin, 'Cabin'),
    ('passthrough', passthrough),
    ('drop', drop)
)
# Fit the preprocessing on the training features and transform them.
X_cleaned = preprocessing.fit_transform(X, Y)
from sklearn.model_selection import KFold
# Shared 5-fold shuffled splitter; the seed keeps fold assignment reproducible.
cross_val_fold = KFold(n_splits=5, shuffle=True, random_state=333)
#!pip install tpot
import tpot
from tpot import TPOTClassifier
# AutoML search: 2 generations x 50 candidate pipelines, scored by balanced
# accuracy on the shared KFold splitter. config_dict='TPOT sparse' is
# presumably chosen because the preprocessed matrix (CountVectorizer branch)
# is sparse — TODO confirm.
automl = TPOTClassifier(generations=2, population_size=50,
scoring='balanced_accuracy',
cv=cross_val_fold,
config_dict='TPOT sparse', verbosity=2, random_state=77, n_jobs=-1)
# automl.fit(X_cleaned, Y)
Generation 1 - Current best internal CV score: 0.8148907220160109 Generation 2 - Current best internal CV score: 0.8148907220160109 Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.7500000000000001, min_samples_leaf=4, min_samples_split=7, n_estimators=100)
TPOTClassifier(config_dict='TPOT sparse', crossover_rate=0.1, cv=KFold(n_splits=5, random_state=333, shuffle=True), disable_update_check=False, early_stop=None, generations=2, log_file=None, max_eval_time_mins=5, max_time_mins=None, memory=None, mutation_rate=0.9, n_jobs=-1, offspring_size=None, periodic_checkpoint_folder=None, population_size=50, random_state=77, scoring='balanced_accuracy', subsample=1.0, template=None, use_dask=False, verbosity=2, warm_start=False)
# dir(automl)
# Export the best pipeline TPOT found as a standalone Python script.
automl.export('automl.py')
# Display the generated script (IPython shell escape).
!cat automl.py
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'], random_state=77)

# Average CV score on the training set was: 0.8148907220160109
exported_pipeline = RandomForestClassifier(bootstrap=True, criterion="entropy", max_features=0.7500000000000001, min_samples_leaf=4, min_samples_split=7, n_estimators=100)
# Fix random state in exported estimator
if hasattr(exported_pipeline, 'random_state'):
    setattr(exported_pipeline, 'random_state', 77)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
# Rebuild the TPOT-selected model by hand, chain it after our preprocessing,
# and re-estimate its balanced accuracy with the same CV splitter.
# random_state=77 matches the TPOT-exported script, which explicitly sets it
# on the estimator — without it the reported score is not reproducible.
rfc = RandomForestClassifier(bootstrap=True, criterion="entropy",
                             max_features=0.7500000000000001,
                             min_samples_leaf=4, min_samples_split=7,
                             n_estimators=100, random_state=77)
rfc_pipeline = make_pipeline(preprocessing, rfc)
score = cross_val_score(rfc_pipeline, X, Y, cv=cross_val_fold, scoring='balanced_accuracy')
score.mean(), score.std()
(0.8059480239789657, 0.016631960277583645)