ml/1_sklearn_example.py

import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    mean_squared_log_error,
    r2_score,
)
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR

from astrodata.ml.metrics import SklearnMetric
from astrodata.ml.models import SklearnModel

if __name__ == "__main__":

    # This example demonstrates how to use SklearnModel (or any astrodata.ml.models.BaseMlModel, for that matter) for basic tasks.

    # Here we use the diabetes dataset, which is a regression dataset. sklearn.datasets.load_diabetes takes care of loading
    # the data and splitting it into features (X) and target (y). We then split the data into training and test sets using
    # train_test_split from sklearn.model_selection.

    data = load_diabetes()
    X = pd.DataFrame(data.data, columns=data.feature_names)
    y = pd.Series(data.target)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42
    )
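
    # Optional sanity check: with test_size=0.25, roughly 75% of the samples go to training and 25% to testing.
    # print(X_train.shape, X_test.shape)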

    # After loading the data, we instantiate a SklearnModel with the desired model class.
    # In this case, we use LinearSVR from sklearn.svm, which is a support vector regression model.
    # We also set a random state for reproducibility.
    # SklearnModel is a wrapper around sklearn models that provides compatibility with the astrodata.ml framework,
    # allowing you to use it seamlessly with the rest of the astrodata.ml ecosystem.
    # You can use any sklearn model class here, such as LinearRegression, RandomForestRegressor, etc.

    model = SklearnModel(model_class=LinearSVR, random_state=42)

    print(f"Model instantiated: {model}")

    # We can define the metrics we want to use for evaluation.
    # SklearnMetric is a wrapper around sklearn metrics that provides compatibility with the astrodata.ml framework.
    # Here we define several metrics commonly used for regression tasks. The greater_is_better parameter indicates
    # whether a higher score is better for that metric. For example, for mean_absolute_error, a lower value is better,
    # so we set greater_is_better=False.

    mae = SklearnMetric(mean_absolute_error, greater_is_better=False)
    mse = SklearnMetric(mean_squared_error)
    r2 = SklearnMetric(r2_score, greater_is_better=True)
    msle = SklearnMetric(mean_squared_log_error)

    metrics = [mae, mse, r2, msle]
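
    # Other sklearn metrics can be wrapped the same way. As a sketch, median_absolute_error
    # (lower is better, hence greater_is_better=False):
    # from sklearn.metrics import median_absolute_error
    # medae = SklearnMetric(median_absolute_error, greater_is_better=False)
    # metrics.append(medae)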

    # Now we can fit the model to the training data using the fit method.

    model.fit(X_train, y_train)

    # The predict method returns the predicted values for the test set:
    # preds = model.predict(X_test)
    # Here we compute the metrics on the test set using the get_metrics method.

    results = model.get_metrics(
        X_test,
        y_test,
        metrics=metrics,
    )

    print(f"Metrics on test set: {results}")