
Commit d23d2ab

Large model inference (#2215)
* mv HF large model to new dir
* adding HF models to the new dir
* adding HF pippy
* adding large model doc
* adding large model utils
* adding base handler for pippy
* clean up
* clean up
* clean up
* clean up
* formatting
* clean up
* clean ups
* adding max lenght
* clean up
* clean up
* adding logger
* update to latests
* update to latests
* update model-config
* clean up
* adding recent settings
* update steps
* update to use ctx
* update to use ctx
* update to use ctx
* adding logger
* remove setu_up config
* clean up
* changing max length
* adding generate to the inference
* adding better prompt
* typos
* clean up
* adding test for pippy inference
* uncommenting the padding
* update steps
* update instructions
* update instructions
* update instructions
* addressing comments
* adding rpc threads and manual seed
* addressing comments
* clean up
* clean up
* adding LMI doc
* clean up
* remove dist handler test
* add utils here due to circular dependency
* add check for pippy install
* add large model post man json
* addiing dist inference test
* clean up
* adding large model inference json
* using ts start func with mar gen false
* clean up
* remove uneccesary move logs func
* remove uneccesary move logs func
* removing install from src
* fix the model name
* removing expected json type
* adding assertion for exitcode
* moving torchrun to frontend spec
* moving torchrun to frontend spec
* clean up
* make sure for HF it returns the patched model that supports generate for others pipe_driver
* adding torchpippy
* fix typos
* fix typos
* extending the vocab
* extending the vocab
* fix typos
* fix typos
* update examples readme and fix deadinks
* fixing typos

---------

Co-authored-by: lxning <[email protected]>
1 parent 044bbc1 commit d23d2ab

36 files changed (+893 −202 lines)

docs/contents.rst

+5 −4
@@ -3,7 +3,7 @@
    :numbered:
    :caption: Contents:
    :titlesonly:
-
+
    index
    Troubleshooting
    batch_inference_with_ts
@@ -23,14 +23,15 @@
    torchserve_on_wsl
    use_cases
    workflows
+   large_model_inference

 .. toctree::
    :maxdepth: 0
    :caption: Service APIs:
-
+
    apis

 .. toctree::
    :caption: Developer APIs:
-
-   api/dev_api
+
+   api/dev_api

docs/large_model_inference.md

+80
@@ -0,0 +1,80 @@
# Serving large models with Torchserve

This document explains how Torchserve supports large model serving. Here, a large model is one that does not fit on a single GPU and therefore has to be split into multiple partitions across multiple GPUs.

## PiPPy (PyTorch Native solution for large model inference)

PiPPy provides pipeline parallelism for serving large models that do not fit on one GPU. It takes your model, splits it into equally sized stages partitioned over the number of devices you specify, and then uses microbatching to run your batched input for inference (this is most beneficial for batch sizes > 1).
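For intuition, the sketch below shows what microbatching does to a batch: it is split into `chunks` smaller microbatches that are pipelined through the stages one after another. The tensor shapes here are made up purely for illustration.

```python
import torch

# A hypothetical batch of 4 tokenized inputs, each padded to length 80.
batch = torch.randint(0, 50000, (4, 80))

# With chunks=2, the batch is split into two microbatches of size 2 that are
# fed through the pipeline stages back to back, keeping more GPUs busy at once.
chunks = 2
for i, microbatch in enumerate(torch.chunk(batch, chunks, dim=0)):
    print(f"microbatch {i}: shape {tuple(microbatch.shape)}")  # (2, 80)
```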
## How to use PiPPy in Torchserve

To use PiPPy in Torchserve, we need a custom handler that inherits from base_pippy_handler, and we put our settings in model-config.yaml.

A custom handler in Torchserve is simply a Python script that defines the model loading, preprocess, inference, and postprocess logic specific to your workflow.

It would look like below:

Create `custom_handler.py` or any other descriptive name.
```python
# Import the necessary packages along with the following
from abc import ABC

import torch

from ts.torch_handler.distributed.base_pippy_handler import BasePippyHandler
from ts.handler_utils.distributed.pt_pippy import initialize_rpc_workers, get_pipeline_driver


class ModelHandler(BasePippyHandler, ABC):
    def __init__(self):
        super(ModelHandler, self).__init__()
        self.initialized = False

    def initialize(self, ctx):
        super().initialize(ctx)
        model = ...  # load your eager-mode model from model_dir
        # local_rank picks the device that model inputs are moved to (self.device)
        self.device = self.local_rank % torch.cuda.device_count()
        self.model = get_pipeline_driver(model, self.world_size, ctx)
```
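The handler also defines the usual `preprocess`/`inference`/`postprocess` methods. As a minimal sketch of `inference` for a HuggingFace model, assuming `model_type: "HF"` (so the returned pipeline driver supports `generate`, as in the HuggingFace example handler included in this commit) and that a tokenizer was loaded in `initialize`:

```python
def inference(self, input_batch):
    # input_batch comes from preprocess(); both tensors are assumed to already
    # be on self.device, and self.tokenizer to have been created in initialize().
    input_ids, attention_mask = input_batch
    # With model_type "HF" the pipeline driver supports generate(); other model
    # types call the returned pipeline driver directly with the traced inputs.
    outputs = self.model.generate(
        input_ids, attention_mask=attention_mask, max_length=30
    )
    return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
```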
Here is what your `model-config.yaml` needs. This config file is very flexible: you can add settings related to the frontend, backend, and handler.

```yaml
# frontend settings
minWorkers: 1
maxWorkers: 1
maxBatchDelay: 100
responseTimeout: 120
parallelLevel: 4
deviceType: "gpu"
parallelType: "pp" # options depend on the solution: pp (pipeline parallelism), tp (tensor parallelism), pptp (pipeline and tensor parallelism)
# this is used to route input to either rank0 or all ranks from the frontend, based on the solution (e.g. DeepSpeed supports tp, PiPPy supports pp)
torchrun:
    nproc-per-node: 4 # number of processes torchrun starts to serve your model; set to world_size, i.e. the number of GPUs you wish to split your model over
# backend settings
pippy:
    chunks: 1 # sets the microbatch size: microbatch = batch size / chunks
    input_names: ['input_ids'] # input arg names to the model, required for FX tracing
    model_type: "HF" # set the model type to "HF" if you are using a Huggingface model, otherwise leave it blank or set it to the model type you use
    rpc_timeout: 1800
    num_worker_threads: 512 # number of threads for RPC worker init

handler:
    max_length: 80 # max length of tokens for the tokenizer in the handler
```
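Before packaging, it can help to double-check the values you set; a minimal sketch that just loads and prints them, assuming PyYAML is installed and the file is named `model-config.yaml`:

```python
import yaml

# Load the config to sanity-check values before archiving; in the example above,
# parallelLevel and torchrun.nproc-per-node are both set to the world size (4).
with open("model-config.yaml") as f:
    cfg = yaml.safe_load(f)

print("parallelLevel:", cfg["parallelLevel"])
print("nproc-per-node:", cfg["torchrun"]["nproc-per-node"])
print("chunks:", cfg["pippy"]["chunks"])
```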
**How do you access these settings in the handler?** Here is an example:

```python
def initialize(self, ctx):
    model_type = ctx.model_yaml_config["pippy"]["model_type"]
```
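`ctx.model_yaml_config` is just the parsed YAML as a dictionary, so the other sections can be read the same way. A small sketch (the fallback defaults here are illustrative, not part of the example config):

```python
def initialize(self, ctx):
    pippy_cfg = ctx.model_yaml_config.get("pippy", {})
    handler_cfg = ctx.model_yaml_config.get("handler", {})

    model_type = pippy_cfg.get("model_type", "")    # "HF" for Huggingface models
    chunks = pippy_cfg.get("chunks", 1)             # microbatch count
    max_length = handler_cfg.get("max_length", 80)  # tokenizer max length
```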
The rest is as usual in Torchserve: package your model and start the server.

An example of the command for packaging your model; make sure you pass model-config.yaml:

```bash
torch-model-archiver --model-name bloom --version 1.0 --handler pippy_handler.py --extra-files $MODEL_CHECKPOINTS_PATH -r requirements.txt --config-file model-config.yaml --archive-format tgz
```
Tensor parallelism support is in progress and will be added as soon as it is ready.

examples/README.md

+5 −3
@@ -25,6 +25,8 @@

 * [Serving HuggingFace transformers model](Huggingface_Transformers)

+### PiPPy [Serving Large Models with PyTorch Native Solution PiPPy](large_models/Huggingface_pippy/Readme.md)
+
 ### MLFlow <img src="images/mlflow.png" width="50" title="MLFlow" style="float:right padding:20px" />

 * [Deploy models using `mlflow-torchserve` plugin](https://github.com/mlflow/mlflow-torchserve/tree/master/examples)
@@ -43,7 +45,7 @@

 ### Microsoft DeepSpeed-MII <img src="images/mii-white.svg" width="80" title="DeepSpeed MII" style="float:top" />

-* [HuggingFace Stable Diffusion Model with Microsoft DeepSpeed-MII](deepspeed_mii)
+* [HuggingFace Stable Diffusion Model with Microsoft DeepSpeed-MII](large_models/deepspeed_mii/Readme.md)

 ### Prometheus and mtail <img src="images/prometheus-logo.svg" width="30" title="Prometheus" style="float:top" />

@@ -66,8 +68,8 @@
 ### Stable Diffusion <img src="images/huggingface_logo-noborder.svg" width="30" height="30" title="Hugging Face" style="float:right padding:10px" />
 * [Stable Diffusion using HuggingFace Diffusers](diffusers)

-### HuggingFace Large Models <img src="images/huggingface_logo-noborder.svg" width="30" height="30" title="Hugging Face" style="float:right padding:10px" />
-* [HuggingFace Large Models with constrained resources](Huggingface_Largemodels)
+### HuggingFace Large Models with Accelerate <img src="images/huggingface_logo-noborder.svg" width="30" height="30" title="Hugging Face" style="float:right padding:10px" />
+* [HuggingFace Large Models with constrained resources](large_models/Huggingface_accelerate/Readme.md)

 ## UseCases

examples/large_models/Huggingface_pippy/Readme.md

+82
@@ -0,0 +1,82 @@
# Loading large Huggingface models with PiPPy (PyTorch Native Large inference solution)

This document describes how to serve a large HuggingFace (HF) model with PiPPy.

PiPPy provides pipeline parallelism for serving large models that do not fit on one GPU. It takes your model, splits it into equally sized stages partitioned over the number of devices you specify, and then uses micro-batching to run your batched input for inference (this is most beneficial for batch sizes > 1). Micro-batching is the technique pipeline parallelism uses to maximize GPU utilization.

## How to serve your large HuggingFace models with PiPPy in Torchserve?

We use a Torchserve custom handler that inherits from base_pippy_handler to load the model and define our logic for preprocess, inference, and postprocess. This is basically very similar to your evaluation process.
### Step 1: Download model

Log in to the HuggingFace Hub with your token by running the command below

```bash
huggingface-cli login
```
and paste the token generated on the HuggingFace Hub.

```bash
python Download_model.py --model_name facebook/opt-6.7b
```
The script prints the path where the model is downloaded, as below. This is an example; in your workload you will want to use your actual trained model checkpoints.

`model/models--bigscience-bloom-7b1/snapshots/5546055f03398095e385d7dc625e636cc8910bf2/`

The downloaded model is around 14GB.
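If you prefer to script the download rather than use `Download_model.py` (whose contents are not shown here), a minimal sketch using `huggingface_hub` (assumed to be installed) looks like this; the model name and cache directory are just the ones used in this example:

```python
from huggingface_hub import snapshot_download

# Download the checkpoint into ./model, mirroring the layout the example expects.
path = snapshot_download(
    repo_id="facebook/opt-6.7b",
    cache_dir="model",
)
print("Model downloaded to:", path)
```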
### Step 2: Create a model-config.yaml that includes the following

```yaml
minWorkers: 1
maxWorkers: 1
maxBatchDelay: 100
responseTimeout: 120
parallelLevel: 4
deviceType: "gpu"
parallelType: "pp" # PiPPy as the solution for distributed inference
torchrun:
    nproc-per-node: 4 # number of processes torchrun starts to serve your model; set to world_size, i.e. the number of GPUs you wish to split your model over
pippy:
    chunks: 1 # sets the microbatch size: microbatch = batch size / chunks
    input_names: ['input_ids'] # input arg names to the model, required for FX tracing
    model_type: "HF" # set the model type to "HF" if you are using a Huggingface model, otherwise leave it blank or set it to the model type you use
    rpc_timeout: 1800

handler:
    max_length: 80 # max length of tokens for the tokenizer in the handler
```
### Step 3: Generate the Tar/MAR file

Navigate back to the `Huggingface_pippy` directory.

```bash
torch-model-archiver --model-name bloom --version 1.0 --handler pippy_handler.py --extra-files model/models--facebook--opt-iml-max-1.3b/snapshots/d60fa58f50def19751da2075791da359ca19d273 -r requirements.txt --config-file model-config.yaml --archive-format tgz
```

### Step 4: Add the MAR file to the model store

```bash
mkdir model_store
mv bloom.mar model_store
```
### Step 5: Start torchserve

Update config.properties and start torchserve

```bash
torchserve --ncs --start --model-store model_store --models bloom.mar
```
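Once the server is up, you can confirm that the model is registered and its workers are healthy before sending traffic; a minimal sketch against TorchServe's management API, assuming the default management port 8081 and that `requests` is installed:

```python
import requests

# Describe the registered model; the response lists its workers and their status.
resp = requests.get("http://localhost:8081/models/bloom")
resp.raise_for_status()
print(resp.json())
```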
### Step 6: Run inference

```bash
curl -v "http://localhost:8080/predictions/bloom" -T sample_text.txt
```
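The same request can be sent from Python instead of curl; a small sketch assuming the default inference port 8080 and the `sample_text.txt` prompt shipped with this example:

```python
import requests

# Read the sample prompt and post it to the TorchServe inference API.
with open("sample_text.txt", "rb") as f:
    prompt = f.read()

resp = requests.post("http://localhost:8080/predictions/bloom", data=prompt)
resp.raise_for_status()
print(resp.text)
```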
examples/large_models/Huggingface_pippy/model-config.yaml

+21
@@ -0,0 +1,21 @@
#frontend settings
minWorkers: 1
maxWorkers: 1
maxBatchDelay: 100
responseTimeout: 120
parallelType: "pp"
deviceType: "gpu"
torchrun:
    nproc-per-node: 4

#backend settings
pippy:
    rpc_timeout: 1800
    model_type: "HF"
    chunks: 1
    input_names: ["input_ids"]
    num_worker_threads: 512

handler:
    max_length: 50
    manual_seed: 40
examples/large_models/Huggingface_pippy/pippy_handler.py

+131
@@ -0,0 +1,131 @@
import logging
import time
from abc import ABC

import requests
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

from ts.handler_utils.distributed.pt_pippy import get_pipeline_driver
from ts.torch_handler.distributed.base_pippy_handler import BasePippyHandler

logger = logging.getLogger(__name__)
logger.info("Transformers version %s", transformers.__version__)


class TransformersSeqClassifierHandler(BasePippyHandler, ABC):
    """
    Transformers handler class for sequence, token classification and question answering.
    """

    def __init__(self):
        super(TransformersSeqClassifierHandler, self).__init__()
        self.initialized = False

    def initialize(self, ctx):
        """In this initialize function, the HF large model is loaded and
        partitioned into multiple stages each on one device using PiPPy.
        Args:
            ctx (context): It is a JSON Object containing information
            pertaining to the model artefacts parameters.
        """
        super().initialize(ctx)
        self.manifest = ctx.manifest
        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        self.device = self.local_rank

        seed = ctx.model_yaml_config["handler"]["manual_seed"]
        torch.manual_seed(seed)

        self.model = AutoModelForCausalLM.from_pretrained(model_dir, use_cache=False)

        self.tokenizer = AutoTokenizer.from_pretrained(model_dir, return_tensors="pt")

        self.max_length = ctx.model_yaml_config["handler"]["max_length"]

        logger.info("Instantiating model Pipeline")
        model_init_start = time.time()
        self.model = get_pipeline_driver(self.model, self.world_size, ctx)

        logger.info("Transformer model from path %s loaded successfully", model_dir)

        self.initialized = True

    def preprocess(self, requests):
        """
        Basic text preprocessing, based on the user's choice of application mode.
        Args:
            requests (list): A list of dictionaries with a "data" or "body" field, each
                containing the input text to be processed.
        Returns:
            tuple: A tuple with two tensors: the batch of input ids and the batch of
                attention masks.
        """
        input_texts = [data.get("data") or data.get("body") for data in requests]
        input_ids_batch, attention_mask_batch = [], []
        for input_text in input_texts:
            input_ids, attention_mask = self.encode_input_text(input_text)
            input_ids_batch.append(input_ids)
            attention_mask_batch.append(attention_mask)
        input_ids_batch = torch.cat(input_ids_batch, dim=0).to(self.device)
        attention_mask_batch = torch.cat(attention_mask_batch, dim=0).to(self.device)
        return input_ids_batch, attention_mask_batch

    def encode_input_text(self, input_text):
        """
        Encodes a single input text using the tokenizer.
        Args:
            input_text (str): The input text to be encoded.
        Returns:
            tuple: A tuple with two tensors: the encoded input ids and the attention mask.
        """
        if isinstance(input_text, (bytes, bytearray)):
            input_text = input_text.decode("utf-8")
        logger.info("Received text: '%s'", input_text)
        inputs = self.tokenizer.encode_plus(
            input_text,
            max_length=self.max_length,
            pad_to_max_length=True,
            add_special_tokens=True,
            return_tensors="pt",
        )
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        return input_ids, attention_mask

    def inference(self, input_batch):
        """
        Predicts the class (or classes) of the received text using the serialized transformers
        checkpoint.
        Args:
            input_batch (tuple): A tuple with two tensors: the batch of input ids and the batch
                of attention masks, as returned by the preprocess function.
        Returns:
            list: A list of strings with the predicted values for each input text in the batch.
        """
        input_ids_batch, attention_mask_batch = input_batch
        input_ids_batch = input_ids_batch.to(self.device)
        outputs = self.model.generate(
            input_ids_batch,
            attention_mask=attention_mask_batch,
            max_length=30,
        )

        inferences = [
            self.tokenizer.batch_decode(
                outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
        ]
        logger.info("Generated text: %s", inferences)
        return inferences

    def postprocess(self, inference_output):
        """Post Process Function converts the predicted response into Torchserve readable format.
        Args:
            inference_output (list): It contains the predicted response of the input text.
        Returns:
            (list): Returns a list of the Predictions and Explanations.
        """
        return inference_output
examples/large_models/Huggingface_pippy/requirements.txt

+2
@@ -0,0 +1,2 @@
transformers
examples/large_models/Huggingface_pippy/sample_text.txt

+1
@@ -0,0 +1 @@
Hey, are you conscious? Can you talk to me?
