# ml/4_1_hyperopt_parallel_example.py

import os

from hyperopt import hp
from sklearn.datasets import fetch_covtype
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

from astrodata.ml.metrics.SklearnMetric import SklearnMetric
from astrodata.ml.model_selection.HyperOptSelector_parallel import (
    HyperOptSelectorParallel,
)
from astrodata.ml.models.SklearnModel import SklearnModel

# TODO: should we delete these? They are empty, including the logfile.
# NOTE(review): "these" presumably refers to the worker job files/directories
# created below — confirm.

# Directory for the MongoDB worker job files (only needed when the training
# runs in parallel; otherwise it is not required).
path_workers_job = "testdata/MongoDB_workers_jobs"
# Create the directory (and any missing parents); do nothing if it exists.
os.makedirs(path_workers_job, exist_ok=True)
# Make it the current working directory, so relative paths used from here on
# resolve inside it.
os.chdir(path_workers_job)


# Candidate models for the search: display name -> wrapped estimator.
model_mapping = {
    "LinearSVC": SklearnModel(model_class=LinearSVC),
    # add other models here
}

# Names offered to the search space, in the mapping's insertion order.
# Replace with an explicit list to search over only a subset of the mapping.
model_list = list(model_mapping)

# Metrics wrapped for the selector. `accuracy` is also referenced by name
# further down as the scorer, so both names stay bound at module level.
accuracy, f1 = (
    SklearnMetric(accuracy_score),
    SklearnMetric(f1_score, average="micro"),
)
# Choose metrics appropriate for the model, or set this to None.
metrics = [accuracy, f1]


# Set to True to use cross-validation, False to skip it.
# A real boolean replaces the former 0/1 integer flag; comparisons such as
# `use_cv == 1` or `use_cv == 0` keep working because True == 1 and
# False == 0 in Python.
use_cv = False


# Load your data. This example fetches scikit-learn's covertype dataset and
# takes its feature matrix and target vector.
data = fetch_covtype()
X, y = data.data, data.target

# Hold out 25% of the samples; the fixed random_state makes the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


# Set your parameter space.
# Each entry maps a hyperparameter name to the discrete values the search may
# pick from; the hyperopt label of every choice is the same as its key.
_search_options = {
    "model": model_list,
    "C": [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 150],
    "max_iter": [100, 150, 200, 250, 500, 750, 1000],
    "tol": [1e-2, 5e-3, 1e-3, 5e-4, 1e-4],
    "fit_intercept": [True, False],
    "class_weight": [None, "balanced"],
}

param_space = {
    label: hp.choice(label, options)
    for label, options in _search_options.items()
}


# Report how many cores this machine exposes (os.cpu_count() may return None
# when the count cannot be determined).
n_cores_available = os.cpu_count()
print(f"You have {n_cores_available} cores available on this machine.")


# Set n_cores_chosen = 1 if you do NOT want to parallelize on multiple cores.
# To parallelize the training, install MongoDB and set mongo_url below.
# NOTE(review): -1 presumably means "use all available cores" — confirm
# against HyperOptSelectorParallel.
n_cores_chosen = -1


# IMPORTANT: use "mongo://" as the protocol, not "mongodb://".
# URL format, as used by the value below:
#   mongo://<host>:<port>/<database_name>/<collection_name>
mongo_url = "mongo://localhost:27017/hyperopt_db/jobs"


# TODO: in the final version, remove the underscore in `HyperOptSelector_`
#  (cf. the `HyperOptSelector_parallel` import). It currently exists so that
#  both the parallel HyperOptSelector and the one from the old code can be
#  used without the confusion of two HyperOptSelectors sharing the same name.


# Build the selector with a single call. The two former branches differed
# only in the boolean forwarded as `use_cv`, so that boolean is computed
# inline: True when the flag equals 1 (or True), False otherwise.
hos = HyperOptSelectorParallel(
    n_cores=n_cores_chosen,
    mongo_url=mongo_url,
    # False avoids n worker terminal windows opening suddenly; if you run
    # into problems, showing them can be informative.
    show_worker_terminal=False,
    param_space=param_space,
    # NOTE(review): the original (Italian) remark here read "otherwise it
    # gives None" — presumably the mapping is required; confirm.
    model_mapping=model_mapping,
    scorer=accuracy,
    use_cv=(use_cv == 1),
    # cv is passed regardless of the flag, exactly as both branches did.
    cv=5,
    random_state=42,
    max_evals=100,  # You can increase this for a more thorough search
    metrics=metrics,
)


# Run the hyperparameter search on the training split.
hos.fit(X_train, y_train)

# Report the outcome. Plain string literals are used because none of the
# labels contain placeholders.
print("Best parameters found:", hos.get_best_params())
print("Best metrics:", hos.get_best_metrics())
# This line prints the best model's parameters, so it gets its own label
# instead of repeating "Best metrics:".
print("Best model parameters:", hos.get_best_model().get_params())