diff --git a/transformers_doc/en/hpo_transformers_hpo_examples_en.ipynb b/transformers_doc/en/hpo_transformers_hpo_examples_en.ipynb
new file mode 100644
index 00000000..88f7936c
--- /dev/null
+++ b/transformers_doc/en/hpo_transformers_hpo_examples_en.ipynb
@@ -0,0 +1,272 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a5a74135",
+   "metadata": {},
+   "source": [
+    "# Hyperparameter Search Examples with 🤗 Transformers\n",
+    "This notebook demonstrates how to use various hyperparameter optimization backends (Optuna, Ray Tune, SigOpt, W&B) with the 🤗 Transformers `Trainer`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2d6a084f",
+   "metadata": {},
+   "source": [
+    "## Setup\n",
+    "Before running the examples below, make sure to install all required packages (`hyperopt` backs the `HyperOptSearch` algorithm used in the Ray Tune example):\n",
+    "```bash\n",
+    "pip install transformers optuna \"ray[tune]\" hyperopt sigopt wandb datasets\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4cf648e4",
+   "metadata": {},
+   "source": [
+    "## Data & Model Initialization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c8f40952",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer\n",
+    "\n",
+    "# Load a small subset of SST-2\n",
+    "dataset = load_dataset(\"glue\", \"sst2\", split=\"train[:200]\")\n",
+    "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
+    "\n",
+    "def preprocess(examples):\n",
+    "    return tokenizer(examples[\"sentence\"], truncation=True, padding=\"max_length\")\n",
+    "\n",
+    "dataset = dataset.map(preprocess, batched=True).train_test_split(test_size=0.2)\n",
+    "\n",
+    "# Trainer calls model_init() to build a fresh model for every trial.\n",
+    "def model_init():\n",
+    "    return AutoModelForSequenceClassification.from_pretrained(\"distilbert-base-uncased\", num_labels=2)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a04ce98b",
+   "metadata": {},
+   "source": [
+    "## Common Objective Function"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "56a66516",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Single-objective: minimize eval_loss (reported automatically by Trainer.evaluate)\n",
+    "def compute_objective(metrics):\n",
+    "    return metrics[\"eval_loss\"]\n"
+   ]
+  },
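+  {
+   "cell_type": "markdown",
+   "id": "e3f1a2b7",
+   "metadata": {},
+   "source": [
+    "`compute_objective` is optional: if you omit it, `Trainer` falls back to `transformers.trainer_utils.default_compute_objective`, which (in the versions current at the time of writing) returns `eval_loss` when no other task metrics are present and otherwise the sum of the remaining metric values. The short sketch below only inspects that default; nothing later in the notebook depends on it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f4c2d8e9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: inspect the built-in default objective used when\n",
+    "# compute_objective is not supplied.\n",
+    "from transformers.trainer_utils import default_compute_objective\n",
+    "\n",
+    "print(default_compute_objective({\"eval_loss\": 0.42}))  # -> 0.42\n"
+   ]
+  },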
Optuna Example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d08078c0", + "metadata": {}, + "outputs": [], + "source": [ + "import optuna\n", + "from transformers.integrations import EarlyStoppingCallback\n", + "\n", + "def optuna_hp_space(trial):\n", + " return {\n", + " \"learning_rate\": trial.suggest_float(\"learning_rate\", 1e-6, 1e-4, log=True),\n", + " \"weight_decay\": trial.suggest_float(\"weight_decay\", 0.0, 0.3),\n", + " \"num_train_epochs\": trial.suggest_int(\"num_train_epochs\", 1, 3),\n", + " \"per_device_train_batch_size\": trial.suggest_categorical(\"per_device_train_batch_size\", [16, 32, 64]),\n", + " \"warmup_steps\": trial.suggest_int(\"warmup_steps\", 0, 100),\n", + " }\n", + "\n", + "training_args = TrainingArguments(\"optuna-hpo\", evaluation_strategy=\"epoch\", logging_steps=10)\n", + "\n", + "trainer = Trainer(\n", + " args=training_args,\n", + " train_dataset=dataset[\"train\"],\n", + " eval_dataset=dataset[\"test\"],\n", + " tokenizer=tokenizer,\n", + " model_init=model_init,\n", + " compute_metrics=lambda p: {\"eval_loss\": p.loss},\n", + " callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],\n", + ")\n", + "\n", + "best_trial = trainer.hyperparameter_search(\n", + " direction=\"minimize\",\n", + " backend=\"optuna\",\n", + " hp_space=optuna_hp_space,\n", + " n_trials=5,\n", + " compute_objective=compute_objective,\n", + ")\n", + "\n", + "print(\"Best Optuna trial:\", best_trial)\n" + ] + }, + { + "cell_type": "markdown", + "id": "a8ff17dd", + "metadata": {}, + "source": [ + "## 2. Ray Tune Example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1c10b3b", + "metadata": {}, + "outputs": [], + "source": [ + "from ray import tune\n", + "from ray.tune.schedulers import ASHAScheduler\n", + "from ray.tune.search.hyperopt import HyperOptSearch\n", + "\n", + "def ray_hp_space(trial_config):\n", + " return {\n", + " \"learning_rate\": trial_config[\"learning_rate\"],\n", + " \"per_device_train_batch_size\": trial_config[\"per_device_train_batch_size\"],\n", + " \"num_train_epochs\": trial_config[\"num_train_epochs\"],\n", + " }\n", + "\n", + "ray_search_space = {\n", + " \"learning_rate\": tune.loguniform(1e-5, 1e-3),\n", + " \"per_device_train_batch_size\": tune.choice([16, 32, 64]),\n", + " \"num_train_epochs\": tune.choice([2, 3, 4]),\n", + "}\n", + "\n", + "training_args = TrainingArguments(\"ray-hpo\", evaluation_strategy=\"epoch\", logging_steps=10)\n", + "\n", + "trainer = Trainer(\n", + " args=training_args,\n", + " train_dataset=dataset[\"train\"],\n", + " eval_dataset=dataset[\"test\"],\n", + " tokenizer=tokenizer,\n", + " model_init=model_init,\n", + " compute_metrics=lambda p: {\n", + " \"eval_loss\": p.loss,\n", + " \"eval_accuracy\": (p.predictions.argmax(-1) == p.label_ids).mean()\n", + " },\n", + ")\n", + "\n", + "best_run = trainer.hyperparameter_search(\n", + " direction=\"maximize\",\n", + " backend=\"ray\",\n", + " hp_space=ray_hp_space,\n", + " n_trials=5,\n", + " search_alg=HyperOptSearch(metric=\"eval_accuracy\", mode=\"max\"),\n", + " scheduler=ASHAScheduler(metric=\"eval_accuracy\", mode=\"max\", max_t=3),\n", + " resources_per_trial={\"cpu\": 1, \"gpu\": 0},\n", + " compute_objective=lambda metrics: metrics[\"eval_accuracy\"],\n", + ")\n", + "\n", + "print(\"Best Ray Tune run:\", best_run)\n" + ] + }, + { + "cell_type": "markdown", + "id": "1236b11b", + "metadata": {}, + "source": [ + "## 3. 
SigOpt Example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfd6cf35", + "metadata": {}, + "outputs": [], + "source": [ + "def sigopt_hp_space(trial):\n", + " return [\n", + " {\"bounds\": {\"min\": 1e-6, \"max\": 1e-4}, \"name\": \"learning_rate\", \"type\": \"double\"},\n", + " {\"bounds\": {\"min\": 0.0, \"max\": 0.3}, \"name\": \"weight_decay\", \"type\": \"double\"},\n", + " {\"categorical_values\": [\"16\", \"32\", \"64\"], \"name\": \"per_device_train_batch_size\", \"type\": \"categorical\"},\n", + " {\"bounds\": {\"min\": 1, \"max\": 3}, \"name\": \"num_train_epochs\", \"type\": \"int\"},\n", + " ]\n", + "\n", + "best_trials = trainer.hyperparameter_search(\n", + " direction=[\"minimize\", \"maximize\"],\n", + " backend=\"sigopt\",\n", + " hp_space=sigopt_hp_space,\n", + " n_trials=5,\n", + " compute_objective=lambda m: (m[\"eval_loss\"], m[\"eval_accuracy\"])\n", + ")\n", + "\n", + "print(\"Best SigOpt trials:\", best_trials)\n" + ] + }, + { + "cell_type": "markdown", + "id": "b1cb72e0", + "metadata": {}, + "source": [ + "## 4. Weights & Biases (W&B) Example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd86a741", + "metadata": {}, + "outputs": [], + "source": [ + "import wandb\n", + "\n", + "def wandb_hp_space(trial):\n", + " return {\n", + " \"method\": \"random\",\n", + " \"metric\": {\"name\": \"eval_loss\", \"goal\": \"minimize\"},\n", + " \"parameters\": {\n", + " \"learning_rate\": {\"distribution\": \"uniform\", \"min\": 1e-6, \"max\": 1e-4},\n", + " \"per_device_train_batch_size\": {\"values\": [16, 32, 64]},\n", + " \"num_train_epochs\": {\"values\": [1, 2, 3]},\n", + " },\n", + " }\n", + "\n", + "best_runs = trainer.hyperparameter_search(\n", + " direction=\"minimize\",\n", + " backend=\"wandb\",\n", + " hp_space=wandb_hp_space,\n", + " n_trials=5,\n", + " compute_objective=compute_objective,\n", + ")\n", + "\n", + "print(\"Best W&B runs:\", best_runs)\n" + ] + }, + { + "cell_type": "markdown", + "id": "3ab87274", + "metadata": {}, + "source": [ + "**End of examples.**\n", + "\n", + "You can adjust `n_trials`, early stopping, objective functions, and other settings to suit your specific task." + ] + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transformers_doc/hpo_transformers_hpo_examples_en.ipynb b/transformers_doc/hpo_transformers_hpo_examples_en.ipynb new file mode 100644 index 00000000..593a369d --- /dev/null +++ b/transformers_doc/hpo_transformers_hpo_examples_en.ipynb @@ -0,0 +1,276 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a5a74135", + "metadata": {}, + "source": [ + "# Hyperparameter Search Examples with 🤗 Transformers\n", + "This notebook demonstrates how to use various hyperparameter optimization backends (Optuna, Ray Tune, SigOpt, W&B) with 🤗 Transformers' `Trainer`." 
+  {
+   "cell_type": "markdown",
+   "id": "c79d4e0f",
+   "metadata": {},
+   "source": [
+    "## 1. Optuna Example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d08078c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import optuna\n",
+    "from transformers import EarlyStoppingCallback\n",
+    "\n",
+    "def optuna_hp_space(trial):\n",
+    "    return {\n",
+    "        \"learning_rate\": trial.suggest_float(\"learning_rate\", 1e-6, 1e-4, log=True),\n",
+    "        \"weight_decay\": trial.suggest_float(\"weight_decay\", 0.0, 0.3),\n",
+    "        \"num_train_epochs\": trial.suggest_int(\"num_train_epochs\", 1, 3),\n",
+    "        \"per_device_train_batch_size\": trial.suggest_categorical(\"per_device_train_batch_size\", [16, 32, 64]),\n",
+    "        \"warmup_steps\": trial.suggest_int(\"warmup_steps\", 0, 100),\n",
+    "    }\n",
+    "\n",
+    "# EarlyStoppingCallback needs load_best_model_at_end=True, a\n",
+    "# metric_for_best_model, and matching evaluation/save strategies.\n",
+    "training_args = TrainingArguments(\n",
+    "    \"optuna-hpo\",\n",
+    "    evaluation_strategy=\"epoch\",\n",
+    "    save_strategy=\"epoch\",\n",
+    "    load_best_model_at_end=True,\n",
+    "    metric_for_best_model=\"eval_loss\",\n",
+    "    greater_is_better=False,\n",
+    "    logging_steps=10,\n",
+    ")\n",
+    "\n",
+    "# No compute_metrics is needed here: Trainer reports eval_loss on its own.\n",
+    "trainer = Trainer(\n",
+    "    args=training_args,\n",
+    "    train_dataset=dataset[\"train\"],\n",
+    "    eval_dataset=dataset[\"test\"],\n",
+    "    tokenizer=tokenizer,\n",
+    "    model_init=model_init,\n",
+    "    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],\n",
+    ")\n",
+    "\n",
+    "best_trial = trainer.hyperparameter_search(\n",
+    "    direction=\"minimize\",\n",
+    "    backend=\"optuna\",\n",
+    "    hp_space=optuna_hp_space,\n",
+    "    n_trials=5,\n",
+    "    compute_objective=compute_objective,\n",
+    ")\n",
+    "\n",
+    "print(\"Best Optuna trial:\", best_trial)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a8ff17dd",
+   "metadata": {},
+   "source": [
+    "## 2. Ray Tune Example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c1c10b3b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from ray import tune\n",
+    "from ray.tune.schedulers import ASHAScheduler\n",
+    "from ray.tune.search.hyperopt import HyperOptSearch\n",
+    "\n",
+    "# With the Ray backend, hp_space returns the Ray Tune search space\n",
+    "# itself; the trial argument is unused.\n",
+    "def ray_hp_space(trial):\n",
+    "    return {\n",
+    "        \"learning_rate\": tune.loguniform(1e-5, 1e-3),\n",
+    "        \"per_device_train_batch_size\": tune.choice([16, 32, 64]),\n",
+    "        \"num_train_epochs\": tune.choice([2, 3, 4]),\n",
+    "    }\n",
+    "\n",
+    "training_args = TrainingArguments(\"ray-hpo\", evaluation_strategy=\"epoch\", logging_steps=10)\n",
+    "\n",
+    "trainer = Trainer(\n",
+    "    args=training_args,\n",
+    "    train_dataset=dataset[\"train\"],\n",
+    "    eval_dataset=dataset[\"test\"],\n",
+    "    tokenizer=tokenizer,\n",
+    "    model_init=model_init,\n",
+    "    # Metric keys get an eval_ prefix during evaluation, so this is\n",
+    "    # reported as eval_accuracy (eval_loss is reported automatically).\n",
+    "    compute_metrics=lambda p: {\"accuracy\": (p.predictions.argmax(-1) == p.label_ids).mean()},\n",
+    ")\n",
+    "\n",
+    "best_run = trainer.hyperparameter_search(\n",
+    "    direction=\"maximize\",\n",
+    "    backend=\"ray\",\n",
+    "    hp_space=ray_hp_space,\n",
+    "    n_trials=5,\n",
+    "    # Additional keyword arguments are forwarded to ray.tune.run().\n",
+    "    search_alg=HyperOptSearch(metric=\"eval_accuracy\", mode=\"max\"),\n",
+    "    scheduler=ASHAScheduler(metric=\"eval_accuracy\", mode=\"max\", max_t=3),\n",
+    "    resources_per_trial={\"cpu\": 1, \"gpu\": 0},\n",
+    "    compute_objective=lambda metrics: metrics[\"eval_accuracy\"],\n",
+    ")\n",
+    "\n",
+    "print(\"Best Ray Tune run:\", best_run)\n"
+   ]
+  },
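+  {
+   "cell_type": "markdown",
+   "id": "9a7b3c5d",
+   "metadata": {},
+   "source": [
+    "`hyperparameter_search` returns a `BestRun` carrying the run id, the objective value, and the winning hyperparameters. The cell below is a minimal sketch of one common follow-up: copy the winning values onto the trainer's `TrainingArguments` and train once more. It assumes every key in `best_run.hyperparameters` matches a `TrainingArguments` attribute, which holds for the search spaces in this notebook; since `model_init` is set, `train()` starts from a freshly initialized model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8d6e2f4a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: re-train once with the winning Ray Tune configuration.\n",
+    "# Assumes the hyperparameter names match TrainingArguments attributes.\n",
+    "for name, value in best_run.hyperparameters.items():\n",
+    "    setattr(trainer.args, name, value)\n",
+    "\n",
+    "trainer.train()\n"
+   ]
+  },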
Ray Tune Example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1c10b3b", + "metadata": {}, + "outputs": [], + "source": [ + "from ray import tune\n", + "from ray.tune.schedulers import ASHAScheduler\n", + "from ray.tune.search.hyperopt import HyperOptSearch\n", + "\n", + "def ray_hp_space(trial_config):\n", + " return {\n", + " \"learning_rate\": trial_config[\"learning_rate\"],\n", + " \"per_device_train_batch_size\": trial_config[\"per_device_train_batch_size\"],\n", + " \"num_train_epochs\": trial_config[\"num_train_epochs\"],\n", + " }\n", + "\n", + "ray_search_space = {\n", + " \"learning_rate\": tune.loguniform(1e-5, 1e-3),\n", + " \"per_device_train_batch_size\": tune.choice([16, 32, 64]),\n", + " \"num_train_epochs\": tune.choice([2, 3, 4]),\n", + "}\n", + "\n", + "training_args = TrainingArguments(\"ray-hpo\", evaluation_strategy=\"epoch\", logging_steps=10)\n", + "\n", + "trainer = Trainer(\n", + " args=training_args,\n", + " train_dataset=dataset[\"train\"],\n", + " eval_dataset=dataset[\"test\"],\n", + " tokenizer=tokenizer,\n", + " model_init=model_init,\n", + " compute_metrics=lambda p: {\n", + " \"eval_loss\": p.loss,\n", + " \"eval_accuracy\": (p.predictions.argmax(-1) == p.label_ids).mean()\n", + " },\n", + ")\n", + "\n", + "best_run = trainer.hyperparameter_search(\n", + " direction=\"maximize\",\n", + " backend=\"ray\",\n", + " hp_space=ray_hp_space,\n", + " n_trials=5,\n", + " search_alg=HyperOptSearch(metric=\"eval_accuracy\", mode=\"max\"),\n", + " scheduler=ASHAScheduler(metric=\"eval_accuracy\", mode=\"max\", max_t=3),\n", + " resources_per_trial={\"cpu\": 1, \"gpu\": 0},\n", + " compute_objective=lambda metrics: metrics[\"eval_accuracy\"],\n", + ")\n", + "\n", + "print(\"Best Ray Tune run:\", best_run)\n" + ] + }, + { + "cell_type": "markdown", + "id": "1236b11b", + "metadata": {}, + "source": [ + "## 3. SigOpt Example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfd6cf35", + "metadata": {}, + "outputs": [], + "source": [ + "def sigopt_hp_space(trial):\n", + " return [\n", + " {\"bounds\": {\"min\": 1e-6, \"max\": 1e-4}, \"name\": \"learning_rate\", \"type\": \"double\"},\n", + " {\"bounds\": {\"min\": 0.0, \"max\": 0.3}, \"name\": \"weight_decay\", \"type\": \"double\"},\n", + " {\"categorical_values\": [\"16\", \"32\", \"64\"], \"name\": \"per_device_train_batch_size\", \"type\": \"categorical\"},\n", + " {\"bounds\": {\"min\": 1, \"max\": 3}, \"name\": \"num_train_epochs\", \"type\": \"int\"},\n", + " ]\n", + "\n", + "best_trials = trainer.hyperparameter_search(\n", + " direction=[\"minimize\", \"maximize\"],\n", + " backend=\"sigopt\",\n", + " hp_space=sigopt_hp_space,\n", + " n_trials=5,\n", + " compute_objective=lambda m: (m[\"eval_loss\"], m[\"eval_accuracy\"])\n", + ")\n", + "\n", + "print(\"Best SigOpt trials:\", best_trials)\n" + ] + }, + { + "cell_type": "markdown", + "id": "b1cb72e0", + "metadata": {}, + "source": [ + "## 4. 
Weights & Biases (W&B) Example" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd86a741", + "metadata": {}, + "outputs": [], + "source": [ + "import wandb\n", + "\n", + "def wandb_hp_space(trial):\n", + " return {\n", + " \"method\": \"random\",\n", + " \"metric\": {\"name\": \"eval_loss\", \"goal\": \"minimize\"},\n", + " \"parameters\": {\n", + " \"learning_rate\": {\"distribution\": \"uniform\", \"min\": 1e-6, \"max\": 1e-4},\n", + " \"per_device_train_batch_size\": {\"values\": [16, 32, 64]},\n", + " \"num_train_epochs\": {\"values\": [1, 2, 3]},\n", + " },\n", + " }\n", + "\n", + "best_runs = trainer.hyperparameter_search(\n", + " direction=\"minimize\",\n", + " backend=\"wandb\",\n", + " hp_space=wandb_hp_space,\n", + " n_trials=5,\n", + " compute_objective=compute_objective,\n", + ")\n", + "\n", + "print(\"Best W&B runs:\", best_runs)\n" + ] + }, + { + "cell_type": "markdown", + "id": "3ab87274", + "metadata": {}, + "source": [ + "**End of examples.**\n", + "\n", + "You can adjust `n_trials`, early stopping, objective functions, and other settings to suit your specific task." + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}