
feat: split process_data out from run_training #438

Open · wants to merge 1 commit into main
11 changes: 5 additions & 6 deletions README.md
@@ -92,9 +92,8 @@ for training jobs. There are a number of options you can specify, such as settin
| Field | Description |
| --- | --- |
| model_path | Either a reference to a HuggingFace repo or a path to a model saved in the HuggingFace format. |
| data_path | A path to the `.jsonl` training dataset. This is expected to be in the messages format. |
| data_path | A path to the `.jsonl` training dataset. This is expected to be processed (post filtering/tokenization/masking). |
| ckpt_output_dir | Directory where trained model checkpoints will be saved. |
| data_output_dir | Directory where the processed training data is stored (post filtering/tokenization/masking) |
| max_seq_len | The maximum sequence length to be included in the training set. Samples exceeding this length will be dropped. |
| max_batch_len | Maximum tokens per gpu for each batch that will be handled in a single step. Used as part of the multipack calculation. If running into out-of-memory errors, try to lower this value, but not below the `max_seq_len`. |
| num_epochs | Number of epochs to run through before stopping. |
@@ -281,7 +280,7 @@ training_args = TrainingArgs(
model_path = "ibm-granite/granite-7b-base",
data_path = "path/to/dataset.jsonl",
ckpt_output_dir = "data/saved_checkpoints",
data_output_dir = "data/outputs",
data_path = "data/outputs/data.jsonl",

# define model-training parameters
max_seq_len = 4096,
@@ -335,13 +334,14 @@ from instructlab.training import (
DataProcessArgs,
data_process as dp
)
import os
Contributor:

Nit, opinion, not required: in some codebases I've worked with, it's common to import os.path instead of os, to give a hint that we care about the path functions specifically and not just any other os functionality.

Member:

@booxter or better yet, from os import path 😉
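
As an aside, both suggestions reach the same standard-library helper that this example relies on below; a trivial illustration (not part of the diff):

import os.path
from os import path

# The two names refer to the same module, so these calls are equivalent:
os.path.dirname("data/outputs/data.jsonl")  # -> "data/outputs"
path.dirname("data/outputs/data.jsonl")     # -> "data/outputs"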


training_args = TrainingArgs(
# define data-specific arguments
model_path = "ibm-granite/granite-7b-base",
data_path = "path/to/dataset.jsonl",
ckpt_output_dir = "data/saved_checkpoints",
data_output_dir = "data/outputs",
data_path = "data/outputs/data.jsonl",

# define model-training parameters
max_seq_len = 4096,
@@ -352,12 +352,11 @@ training_args = TrainingArgs(
learning_rate = 2e-6,
warmup_steps = 800,
random_seed = 42,
process_data = True,
)
...

data_process_args = DataProcessArgs(
data_output_path = training_args.data_output_dir,
data_output_path = os.path.dirname(training_args.data_path),
model_path = training_args.model_path,
data_path = training_args.data_path,
max_seq_len = training_args.max_seq_len,
19 changes: 14 additions & 5 deletions examples/01_building_a_reasoning_model.ipynb
@@ -123,8 +123,10 @@
"metadata": {},
"outputs": [],
"source": [
"from instructlab.training.config import TorchrunArgs,TrainingArgs,DistributedBackend,FSDPOptions\n",
"from instructlab.training.main_ds import run_training"
"from instructlab.training.config import TorchrunArgs,TrainingArgs,DistributedBackend,FSDPOptions,DataProcessArgs\n",
"from instructlab.training.main_ds import run_training\n",
"from instructlab.training.data_process import process_data as dp\n",
"import os"
]
},
{
@@ -166,7 +168,7 @@
"\tmodel_path=\"microsoft/Phi-4-mini-instruct\",\n",
"\tdata_path=\"nemotron.jsonl\",\n",
"\tckpt_output_dir=\"experiments/training_output\",\n",
"\tdata_output_dir=\"data/processed-data\", # processed data ids/labels/masks\n",
"\tdata_path=\"data/processed-data/data.jsonl\", # processed data ids/labels/masks\n",
"\tmax_seq_len=20000,\n",
"\tmax_batch_len=30000, # max tokens per gpu\n",
"\tnum_epochs=3, \n",
@@ -176,17 +178,23 @@
" save_samples=0, # save ckpt after num of samples seen (0=off)\n",
" checkpoint_at_epoch = True, # save ckpt after every epoch\n",
" accelerate_full_state_at_epoch = False, # save full-state for resuming\n",
" process_data=True, # can set to false if data processed before\n",
"\tdistributed_backend=DistributedBackend.FSDP,\n",
"\tfsdp_options=FSDPOptions(cpu_offload_params=False),\n",
")\n",
"data_process_args = DataProcessArgs(\n",
" data_output_path = os.path.dirname(train_args.data_path),\n",
" model_path = train_args.model_path,\n",
" data_path = train_args.data_path,\n",
" max_seq_len = train_args.max_seq_len,\n",
" chat_tmpl_path = train_args.chat_tmpl_path\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, we kick off SFT via the run_training function:"
"Finally, we process the data and then kick off SFT via the run_training function:"
]
},
{
@@ -195,6 +203,7 @@
"metadata": {},
"outputs": [],
"source": [
"dp.main(data_process_args)\n",
"run_training(torch_args=torch_args,train_args=train_args)"
]
},
10 changes: 2 additions & 8 deletions src/instructlab/training/config.py
@@ -162,13 +162,10 @@ class TrainingArgs(BaseModel):
# this field determines if ibm_legacy_tmpl should be used instead
use_legacy_tmpl: bool = False

# this field specifies the filepath to the training dataset before processing
# this field specifies the filepath to the training dataset
data_path: str
ckpt_output_dir: str

# this field defines where we should be saving the processed version of the training dataset
# after we have tokenized it
data_output_dir: str
Contributor:

AFAIU the instructlab CLI currently uses this field to pass --data-output-dir from the user. It also assumes the default behavior of process_data being True.

Would it be possible / desirable to make the transition smoother, e.g. by

  1. marking the data_output_dir field as deprecated;
  2. if it's passed, directing users with a warning to use the data_path name instead (and not assuming pre-processing?); a sketch of this follows below.

Or is the strategy here to rewrite the CLI and bump the minimal version of the training library? Even if the CLI training rewrite for the new interface is the plan, it would be easier on the remaining CLI maintainers if the change happens through gradual deprecation.
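
A minimal sketch of that gradual-deprecation idea, assuming Pydantic v2 validators and the field names used in this PR (illustrative only, not part of the diff):

from typing import Optional
import warnings

from pydantic import BaseModel, model_validator


class TrainingArgs(BaseModel):
    # New-style field: path to the already-processed training dataset.
    data_path: str
    ckpt_output_dir: str

    # Deprecated: accepted for now so existing callers (e.g. the ilab CLI) keep working.
    data_output_dir: Optional[str] = None

    @model_validator(mode="after")
    def _warn_on_deprecated_data_output_dir(self):
        if self.data_output_dir is not None:
            warnings.warn(
                "data_output_dir is deprecated; pre-process the dataset and point "
                "data_path at the processed .jsonl file instead.",
                DeprecationWarning,
                stacklevel=2,
            )
        return self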

Member:

> Even if the CLI training rewrite for the new interface is the plan, it would be easier on the remaining CLI maintainers if the change happens through gradual deprecation

I'm not sure how this is handled in the current refactor @cdoern is working on. Assuming it doesn't handle this case, this is how I recommend we handle it based on how we made the initial refactor to move to the newer version of data processing:

  1. Create a new API for what is currently run_training that only runs training; we could call it run_training_v2 or something else.
  2. Keep run_training with a deprecation notice and have it still perform this orchestration under the hood (a sketch of this follows below).
  3. Refactor ilab to use the new APIs.
  4. Remove run_training after a few releases, as you have described here.
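
A rough sketch of steps 1 and 2, assuming run_training_v2 as the hypothetical training-only entry point and data_output_dir retained as a deprecated field during the transition (none of this is in the current diff):

import os
import warnings

import instructlab.training.data_process as dp
from instructlab.training.config import TorchrunArgs, TrainingArgs
from instructlab.training.main_ds import run_training_v2  # hypothetical new API (step 1)


def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
    """Deprecated wrapper that keeps the old process-then-train orchestration (step 2)."""
    warnings.warn(
        "run_training() will stop processing data in a future release; call "
        "data_process and the training-only entry point separately.",
        DeprecationWarning,
        stacklevel=2,
    )
    dp.process_data(
        data_output_path=train_args.data_output_dir,
        model_path=train_args.model_path,
        data_path=train_args.data_path,
        max_seq_len=train_args.max_seq_len,
        chat_tmpl_path=train_args.chat_tmpl_path,
        num_cpu_procs=train_args.data_process_num_cpu_procs,
    )
    # Train on the processed file, matching the path the old orchestration produced.
    train_args.data_path = os.path.join(train_args.data_output_dir, "data.jsonl")
    run_training_v2(torch_args=torch_args, train_args=train_args)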

ckpt_output_dir: str

max_seq_len: int
max_batch_len: int
@@ -207,9 +204,6 @@ class TrainingArgs(BaseModel):
# quantize_dtype: QuantizeDataType = QuantizeDataType.NONE
lora: LoraOptions | None = None

# This field defines whether or not data processing will occur inside of `run_training()`
process_data: Optional[bool] = True

# This field specifies whether only the last checkpoint should be retained. When set to true, it
# will overwrite the previous checkpoint directory, keeping only one directory called
# "last_epoch". This works alongside the '--checkpoint_at_epoch' flag.
17 changes: 1 addition & 16 deletions src/instructlab/training/main_ds.py
@@ -77,7 +77,6 @@
set_random_seed,
setup_logger,
)
import instructlab.training.data_process as dp


def setup_optimizer(args, model):
@@ -669,20 +668,6 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
os.path.dirname(__file__), "chat_templates/ibm_legacy_tmpl.py"
)

if train_args.process_data:
# TODO(osilkin):
# Decouple the data processing logic from training.
# Now that we've decided that repos will be less tethered to the
# design choices of the `ilab` CLI, we can make this change.
dp.process_data(
data_output_path=train_args.data_output_dir,
model_path=train_args.model_path,
data_path=train_args.data_path,
max_seq_len=train_args.max_seq_len,
chat_tmpl_path=train_args.chat_tmpl_path,
num_cpu_procs=train_args.data_process_num_cpu_procs,
)

if not os.path.exists(train_args.ckpt_output_dir):
os.makedirs(train_args.ckpt_output_dir, exist_ok=True)

@@ -695,7 +680,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
f"--rdzv_endpoint={torch_args.rdzv_endpoint}",
__file__,
f"--model_name_or_path={train_args.model_path}",
f"--data_path={train_args.data_output_dir}/data.jsonl",
f"--data_path={train_args.data_path}",
f"--output_dir={train_args.ckpt_output_dir}",
f"--num_epochs={train_args.num_epochs}",
f"--effective_batch_size={train_args.effective_batch_size}",