Examples#

1-phase random forest classifier#

setup#

load packages:

>>> import pandas as pd
>>> import numpy as np
>>> from yaml import load
>>> from yaml import CLoader as Loader
>>> from planktonsdm.tune import tune
>>> from planktonsdm.predict import predict
>>> from planktonsdm.post import post
>>> from planktonsdm.functions import example_data

load configuration yaml:

>>> with open('/home/phyto/planktonSDM/configuration/example_model_config.yml', 'r') as f:
...     model_config = load(f, Loader=Loader)
>>> print(model_config)
  {'root': '/home/phyto/CoccoML/',
  'path_out': 'ModelOutput/',
  'traits': '/home/phyto/CoccoML/data/traits.csv',
  'scale_X': True,
  'verbose': 1,
  'seed': 1,
  'n_threads': 2,
  'cv': 3,
  'predict_probability': False,
  'ensemble_config': {
        'regressor': False,
        'classifier': True,
        'm1': 'rf'},
  'upsample': False,
  'clf_scoring': {
        'accuracy': 'balanced_accuracy'},
  'param_grid': {
        'rf_param_grid': {
                'clf_param_grid': {
                        'n_estimators': [100],
                        'max_features': [3, 4],
                        'max_depth': [50, 100],
                        'min_samples_leaf': [0.5, 1],
                        'max_samples': [0.5, 1]}}}
  }

create example count data:

>>> X, y = example_data(y_name =  "Coccolithus pelagicus", n_samples=500, n_features=5,
...     noise=20, random_state=model_config['seed'])

create example envdata:

>>> envdata = pd.DataFrame({"no3": np.random.rand(50), "mld": np.random.rand(50), "par": np.random.rand(50),
...     "o2": np.random.rand(50), "temp": np.random.rand(50), "lat": range(0, 50, 1), "lon": 50*[1]})
>>> envdata.set_index(['lat', 'lon'], inplace=True)

train#

>>> m = tune(X, y, model_config)
>>> m.train(model="rf", classifier=True)

predict#

>>> envdata = pd.DataFrame(X)
>>> m = predict(X, y, envdata, model_config)
>>> m.make_prediction()

post#

>>> m = post(model_config)
>>> m.merge_env()
>>> m.export_ds(file_name = "1-phase_rf")

2-phase XGBoost regressor#

setup#

load configuration yaml:

>>> with open('/home/phyto/planktonSDM/configuration/example_model_config.yml', 'r') as f:
...     model_config = load(f, Loader=Loader)
>>> print(model_config)
  {'root': '/home/phyto/CoccoML/',
  'path_out': 'ModelOutput/',
  'traits': '/home/phyto/CoccoML/data/traits.csv',
  'scale_X': True,
  'verbose': 1,
  'seed': 1,
  'n_threads': 2,
  'cv': 3,
  'predict_probability': False,
  'ensemble_config': {
        'regressor': True,
        'classifier': True,
        'm1': 'xgb'},
  'upsample': False,
  'clf_scoring': {
        'accuracy':
        'balanced_accuracy'},
  'reg_scoring': {
        'R2': 'r2',
        'MAE': 'neg_mean_absolute_error',
        'RMSE': 'neg_root_mean_squared_error'},
  'param_grid': {
        'xgb_param_grid': {
                'clf_param_grid': {
                        'eta': [0.01],
                        'n_estimators': [100],
                        'max_depth': [4],
                        'subsample': [0.6],
                        'colsample_bytree': [0.6],
                        'gamma': [1],
                        'alpha': [1]},
                'reg_param_grid': {
                        'regressor__eta': [0.01],
                        'regressor__n_estimators': [100],
                        'regressor__max_depth': [4],
                        'regressor__subsample': [0.6],
                        'regressor__colsample_bytree': [0.6],
                        'regressor__gamma': [1],
                        'regressor__alpha': [1]}}}
  }

train#

>>> m = tune(X, y, model_config)
>>> m.train(model="xgb", classifier=True, regressor=True)

predict#

>>> envdata = pd.DataFrame(X)
>>> m = predict(X, y, envdata, model_config)
>>> m.make_prediction()

post#

>>> m = post(model_config)
>>> m.merge_env()
>>> m.export_ds(file_name = "2-phase_xgboost")

1-phase ensemble regression#

setup#

load configuration yaml:

>>> with open('/home/phyto/planktonSDM/configuration/example_model_config.yml', 'r') as f:
...     model_config = load(f, Loader=Loader)
>>> print(model_config)
  {'root': '/home/phyto/CoccoML/',
  'path_out': 'ModelOutput/',
  'traits': '/home/phyto/CoccoML/data/traits.csv',
  'scale_X': True,
  'verbose': 1,
  'seed': 1,
  'n_threads': 2,
  'cv': 3,
  'predict_probability': False,
  'ensemble_config': {
        'regressor': True,
        'classifier': False,
        'm1': 'rf',
        'm2': 'xgb',
        'm3': 'knn'},
  'upsample': False,
  'reg_scoring': {
        'R2': 'r2',
        'MAE': 'neg_mean_absolute_error',
        'RMSE': 'neg_root_mean_squared_error'},
  'param_grid': {
        'rf_param_grid': {
                'reg_param_grid': {
                        'regressor__n_estimators': [100],
                        'regressor__max_features': [3, 4],
                        'regressor__max_depth': [50, 100],
                        'regressor__min_samples_leaf': [0.5, 1],
                        'regressor__max_samples': [0.5, 1]}},
        'xgb_param_grid': {
                'reg_param_grid': {
                        'regressor__eta': [0.01],
                        'regressor__n_estimators': [100],
                        'regressor__max_depth': [4],
                        'regressor__subsample': [0.6],
                        'regressor__colsample_bytree': [0.6],
                        'regressor__gamma': [1],
                        'regressor__alpha': [1]}},
        'knn_param_grid': {
                'reg_param_grid': {
                        'regressor__max_samples': [0.5],
                        'regressor__max_features': [0.5],
                        'regressor__estimator__leaf_size': [30],
                        'regressor__estimator__n_neighbors': [3],
                        'regressor__estimator__p': [1],
                        'regressor__estimator__weights': ['uniform']}}},
  'knn_bagging_estimators': 30}

train#

>>> m = tune(X, y, model_config)
>>> m.train(model="rf", regressor=True)
>>> m.train(model="knn", regressor=True)
>>> m.train(model="xgb", regressor=True)

predict#

>>> envdata = pd.DataFrame(X)
>>> m = predict(X, y, envdata, model_config)
>>> m.make_prediction()

post#

>>> m = post(model_config)
>>> m.merge_env()
>>> m.export_ds(file_name = "1-phase_ensemble_regression")