From ee3b51d4d98f4b359129d6980990d43c55e050e3 Mon Sep 17 00:00:00 2001 From: dimdano Date: Fri, 11 Oct 2024 11:22:28 +0200 Subject: [PATCH 01/50] test commit --- hls4ml/converters/keras_to_hls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index 00561e6ba8..e0f2a79c02 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ -325,6 +325,6 @@ def parse_keras_model(model_arch, reader): def keras_to_hls(config): model_arch, reader = get_model_arch(config) layer_list, input_layers, output_layers, _ = parse_keras_model(model_arch, reader) - print('Creating HLS model') + print('Creating HLS model...') hls_model = ModelGraph(config, layer_list, input_layers, output_layers) return hls_model From cbeee246289ebc4212766843ba815129d01a7d7c Mon Sep 17 00:00:00 2001 From: dimdano Date: Fri, 11 Oct 2024 17:56:54 +0200 Subject: [PATCH 02/50] split ModelGraph at specified layer name --- hls4ml/converters/keras_to_hls.py | 39 +++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index e0f2a79c02..518e959995 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ -322,9 +322,44 @@ def parse_keras_model(model_arch, reader): return layer_list, input_layers, output_layers, output_shapes -def keras_to_hls(config): +def keras_to_hls(config, split_layer_name = 'fc2'): model_arch, reader = get_model_arch(config) layer_list, input_layers, output_layers, _ = parse_keras_model(model_arch, reader) + print('Creating HLS model...') - hls_model = ModelGraph(config, layer_list, input_layers, output_layers) + if split_layer_name is not None: + + if 'conv' not in split_layer_name and 'fc' not in split_layer_name: + raise ValueError(f"Split layer must be done on conv or dense layers") + + # Find the index of the split layer in layer_list + split_index = next((i for i, layer in enumerate(layer_list) if layer['name'] == split_layer_name), None) + if split_index is None: + raise ValueError(f"Layer '{split_layer_name}' not found in the model. Split must be done on conv or dense layers") + + # Split layer_list into two parts + layer_list1 = layer_list[:split_index] + layer_list2 = layer_list[split_index:] # Include the split layer in the second subgraph + + print(layer_list[0], layer_list2[0], layer_list[-1]) + + # Create a new input layer for the second subgraph + input_layer_dict = { + 'name': layer_list2[0]['name']+'_input', + 'class_name': 'InputLayer', + 'data_format': 'channels_last', + 'input_shape': [layer_list2[0]['n_in']], + } + + # Insert the new input layer at the beginning of layer_list2 + layer_list2.insert(0, input_layer_dict) + + # Create two ModelGraphs + hls_model1 = ModelGraph(config, layer_list1, None, None) + hls_model2 = ModelGraph(config, layer_list2, None, None) + + return hls_model1, hls_model2 + + else: + hls_model = ModelGraph(config, layer_list, input_layers, output_layers) return hls_model From 03111c9f03460e05d61dd112175e264e07bb2abd Mon Sep 17 00:00:00 2001 From: dimdano Date: Mon, 14 Oct 2024 17:45:29 +0200 Subject: [PATCH 03/50] feat: add make_multi_graph classmethod to ModelGraph - The method returns two instances of the `ModelGraph` class. - Each instance is initialized with the same config, only output folder changes, allowing separate models to be created in one call. 
- This improves usability by simplifying the process of generating multiple graphs from a single configuration input. --- hls4ml/converters/__init__.py | 5 ++- hls4ml/converters/keras_to_hls.py | 35 +++----------------- hls4ml/model/graph.py | 53 ++++++++++++++++++++++++++++++- 3 files changed, 60 insertions(+), 33 deletions(-) diff --git a/hls4ml/converters/__init__.py b/hls4ml/converters/__init__.py index 693a76f666..0ba8e9dfdf 100644 --- a/hls4ml/converters/__init__.py +++ b/hls4ml/converters/__init__.py @@ -214,7 +214,10 @@ def convert_from_keras_model( _check_hls_config(config, hls_config) - return keras_to_hls(config) + # Retrieve 'split_layer_name' from kwargs, if provided, for multi-graph creation + split_layer_name = kwargs.get('split_layer_name', None) + + return keras_to_hls(config, split_layer_name=split_layer_name) @requires('_torch') diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index 518e959995..e1948b690d 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ -322,44 +322,17 @@ def parse_keras_model(model_arch, reader): return layer_list, input_layers, output_layers, output_shapes -def keras_to_hls(config, split_layer_name = 'fc2'): +def keras_to_hls(config, split_layer_name = None): model_arch, reader = get_model_arch(config) layer_list, input_layers, output_layers, _ = parse_keras_model(model_arch, reader) print('Creating HLS model...') if split_layer_name is not None: - if 'conv' not in split_layer_name and 'fc' not in split_layer_name: - raise ValueError(f"Split layer must be done on conv or dense layers") - - # Find the index of the split layer in layer_list - split_index = next((i for i, layer in enumerate(layer_list) if layer['name'] == split_layer_name), None) - if split_index is None: - raise ValueError(f"Layer '{split_layer_name}' not found in the model. Split must be done on conv or dense layers") - - # Split layer_list into two parts - layer_list1 = layer_list[:split_index] - layer_list2 = layer_list[split_index:] # Include the split layer in the second subgraph - - print(layer_list[0], layer_list2[0], layer_list[-1]) - - # Create a new input layer for the second subgraph - input_layer_dict = { - 'name': layer_list2[0]['name']+'_input', - 'class_name': 'InputLayer', - 'data_format': 'channels_last', - 'input_shape': [layer_list2[0]['n_in']], - } - - # Insert the new input layer at the beginning of layer_list2 - layer_list2.insert(0, input_layer_dict) - - # Create two ModelGraphs - hls_model1 = ModelGraph(config, layer_list1, None, None) - hls_model2 = ModelGraph(config, layer_list2, None, None) + raise ValueError(f"Split layer must be either Conv. or FC layers") + hls_model1, hls_model2 = ModelGraph.make_multi_graph(config, layer_list, split_layer_name) return hls_model1, hls_model2 - else: hls_model = ModelGraph(config, layer_list, input_layers, output_layers) - return hls_model + return hls_model diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 3136784612..235877b163 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -5,6 +5,7 @@ import numpy as np import numpy.ctypeslib as npc +import copy from hls4ml.backends import get_backend from hls4ml.model.flow import get_flow @@ -12,7 +13,6 @@ from hls4ml.model.optimizer import get_available_passes, optimize_model from hls4ml.utils.string_utils import convert_to_snake_case - class HLSConfig: """The configuration class as stored in the ModelGraph. 
@@ -897,3 +897,54 @@ def build(self, **kwargs):
             self.write()
 
         return self.config.backend.build(self, **kwargs)
+
+    @classmethod
+    def make_multi_graph(cls, config, layer_list, split_layer_name):
+        """Splits the layer list into two at the specified layer and creates two ModelGraphs.
+
+        Args:
+            config (dict): The configuration dictionary.
+            layer_list (list(dict)): The list of layers.
+            split_layer_name (str): The name of the layer to split at.
+
+        Returns:
+            Tuple[ModelGraph, ModelGraph]: Two ModelGraph instances resulting from the split.
+        """
+        layer_names = [layer['name'] for layer in layer_list]
+        if split_layer_name is None or split_layer_name not in layer_names:
+            raise ValueError(f"Layer '{split_layer_name}' not found in the model.")
+
+        split_index = layer_names.index(split_layer_name)
+        layer_list1 = layer_list[:split_index]
+        layer_list2 = layer_list[split_index:] # Include the split layer in the second subgraph
+
+        # Create new input layer for the second subgraph
+        split_layer = layer_list2[0]
+
+        #NOTE - Additional testing needed to verify that the input shape is correctly identified
+        input_shape = split_layer.get('n_in', None)
+        if input_shape is None:
+            raise ValueError(f"Could not find input_shape of '{split_layer_name}'.")
+
+        input_layer_dict = {
+            'name': split_layer['name'] + '_input',
+            'class_name': 'InputLayer',
+            'data_format': 'channels_last',
+            'input_shape': [input_shape],
+        }
+
+        # Insert the new input layer at the beginning of layer_list2
+        layer_list2.insert(0, input_layer_dict)
+
+        # Create two ModelGraphs
+        #NOTE - Maybe create a method inside HLSConfig class that sets OutputDir value
+        original_OutputDir = config['OutputDir']
+
+        hls_model1 = ModelGraph(config, layer_list1, None, None)
+        hls_model2 = ModelGraph(copy.copy(config), layer_list2, None, None) # copy only the top-level objects with shallow copy
+
+        # Change output directory of each graph.
+        hls_model1.config.config['OutputDir'] = original_OutputDir + '_graph1'
+        hls_model2.config.config['OutputDir'] = original_OutputDir + '_graph2'
+
+        return hls_model1, hls_model2

From f4a77bb22d007ffcdb79ae69ea28d262dbbba69b Mon Sep 17 00:00:00 2001
From: dimdano
Date: Tue, 15 Oct 2024 17:41:43 +0200
Subject: [PATCH 04/50] make_multi_graph can now support an arbitrary number of
 graphs

* takes the split_layer_names as split points
* returns a list of ModelGraph instances
* works for dense/fc layers at the moment
* need to find the input_shape of the split layer for conv layers.
  Currently, for dense/fc layers we find it through the 'n_in' key.
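* rough sketch of the intended partitioning (hypothetical layer names, shown
  only to illustrate the index arithmetic used by make_multi_graph below):

    layer_names = ['input1', 'fc1', 'relu1', 'fc2', 'relu2', 'fc3', 'softmax']
    split_layer_names = ['fc2', 'fc3']
    split_indices = sorted(layer_names.index(n) for n in split_layer_names)  # [3, 5]
    indices = [0] + split_indices + [len(layer_names)]                       # [0, 3, 5, 7]
    segments = [layer_names[i:j] for i, j in zip(indices, indices[1:])]
    # -> [['input1', 'fc1', 'relu1'], ['fc2', 'relu2'], ['fc3', 'softmax']]
    # Each split layer starts a new subgraph and is fed by a fresh InputLayer.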
---
 hls4ml/converters/__init__.py     |  6 +-
 hls4ml/converters/keras_to_hls.py | 16 +++---
 hls4ml/model/graph.py             | 92 ++++++++++++++++++-------------
 3 files changed, 66 insertions(+), 48 deletions(-)

diff --git a/hls4ml/converters/__init__.py b/hls4ml/converters/__init__.py
index 0ba8e9dfdf..016fe18a77 100644
--- a/hls4ml/converters/__init__.py
+++ b/hls4ml/converters/__init__.py
@@ -214,10 +214,10 @@ def convert_from_keras_model(
 
     _check_hls_config(config, hls_config)
 
-    # Retrieve 'split_layer_name' from kwargs, if provided, for multi-graph creation
-    split_layer_name = kwargs.get('split_layer_name', None)
+    # Retrieve 'split_layer_names' from kwargs, if provided, for multi-graph creation
+    split_layer_names = kwargs.get('split_layer_names', [])
 
-    return keras_to_hls(config, split_layer_name=split_layer_name)
+    return keras_to_hls(config, split_layer_names=split_layer_names)
 
 
 @requires('_torch')
diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py
index e1948b690d..b7e82da8b1 100644
--- a/hls4ml/converters/keras_to_hls.py
+++ b/hls4ml/converters/keras_to_hls.py
@@ -322,17 +322,19 @@ def parse_keras_model(model_arch, reader):
 
     return layer_list, input_layers, output_layers, output_shapes
 
 
-def keras_to_hls(config, split_layer_name = None):
+def keras_to_hls(config, split_layer_names = []):
     model_arch, reader = get_model_arch(config)
     layer_list, input_layers, output_layers, _ = parse_keras_model(model_arch, reader)
     print('Creating HLS model...')
 
-    if split_layer_name is not None:
-
-        if 'conv' not in split_layer_name and 'fc' not in split_layer_name:
-            raise ValueError(f"Split layer must be either Conv. or FC layers")
-
-        hls_model1, hls_model2 = ModelGraph.make_multi_graph(config, layer_list, split_layer_name)
-        return hls_model1, hls_model2
+    if split_layer_names:
+        if all(name.startswith('fc') or name.startswith('dense') for name in split_layer_names):
+            hls_models = ModelGraph.make_multi_graph(config, layer_list, split_layer_names)
+            print('Multi-graph HLS model created.')
+            return hls_models
+        else:
+            raise ValueError(f"Split layer must be either dense or fc layers")
     else:
         hls_model = ModelGraph(config, layer_list, input_layers, output_layers)
+        print('HLS model created.')
         return hls_model
diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py
index 235877b163..bc3ac7731b 100644
--- a/hls4ml/model/graph.py
+++ b/hls4ml/model/graph.py
@@ -899,52 +899,68 @@ def build(self, **kwargs):
         return self.config.backend.build(self, **kwargs)
 
     @classmethod
-    def make_multi_graph(cls, config, layer_list, split_layer_name):
-        """Splits the layer list into two at the specified layer and creates two ModelGraphs.
+    def make_multi_graph(cls, config, layer_list, split_layer_names):
+        """Splits the layer list at the specified layers and creates multiple ModelGraphs.
 
         Args:
             config (dict): The configuration dictionary.
             layer_list (list(dict)): The list of layers.
-            split_layer_name (str): The name of the layer to split at.
+            split_layer_names (List[str]): The names of the layers to split at.
 
         Returns:
-            Tuple[ModelGraph, ModelGraph]: Two ModelGraph instances resulting from the split.
+            List[ModelGraph]: List of ModelGraph instances resulting from the splits.
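+            Each subgraph after the first receives a fresh InputLayer at the split point.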
""" + if not split_layer_names: + raise ValueError("No split layer names provided.") + layer_names = [layer['name'] for layer in layer_list] - if split_layer_name is None or split_layer_name not in layer_names: - raise ValueError(f"Layer '{split_layer_name}' not found in the model.") - - split_index = layer_names.index(split_layer_name) - layer_list1 = layer_list[:split_index] - layer_list2 = layer_list[split_index:] # Include the split layer in the second subgraph - - # Create new input layer for the second subgraph - split_layer = layer_list2[0] - - #NOTE - Additional testing needed to verify that the input shape is correctly identified - input_shape = split_layer.get('n_in', None) - if input_shape is None: - raise ValueError(f"Could not find input_shape of '{split_layer_name}'.") - - input_layer_dict = { - 'name': split_layer['name'] + '_input', - 'class_name': 'InputLayer', - 'data_format': 'channels_last', - 'input_shape': [input_shape], - } - - # Insert the new input layer at the beginning of layer_list2 - layer_list2.insert(0, input_layer_dict) - - # Create two ModelGraphs - #NOTE - Maybe create a method inside HLSConfig class that sets OutputDir value - original_OutputDir = config['OutputDir'] - hls_model1 = ModelGraph(config, layer_list1, None, None) - hls_model2 = ModelGraph(copy.copy(config), layer_list2, None, None) # copy only the top-level objects with shallow copy + # NOTE - Might need to validate again that split layer names exist in layer list + for name in split_layer_names: + if name not in layer_names: + raise ValueError(f"Layer '{name}' not found in the model.") + + # Get split indices and sort them + split_indices = sorted([layer_names.index(name) for name in split_layer_names]) - # Change output directory of each graph. - hls_model1.config.config['OutputDir'] = original_OutputDir + '_graph1' - hls_model2.config.config['OutputDir'] = original_OutputDir + '_graph2' + # Add start and end indices one after the other to cover the entire layer list + indices = [0] + split_indices + [len(layer_list)] - return hls_model1, hls_model2 + # Split the layer_list into subgraphs + subgraphs_layer_lists = [] + for i in range(len(indices) - 1): + start = indices[i] + end = indices[i + 1] + sub_layer_list = layer_list[start:end] + subgraphs_layer_lists.append(sub_layer_list) + + # Create ModelGraphs for each subgraph + model_graphs = [] + original_OutputDir = config['OutputDir'] + original_ProjectName = config['ProjectName'] + for idx, sub_layer_list in enumerate(subgraphs_layer_lists): + # For subgraphs after the first one, insert a new input layer + if idx > 0: + current_split_layer = sub_layer_list[0] + input_shape = current_split_layer.get('n_in', None) + print(current_split_layer) + #NOTE - Verify that the input shape is correctly identified + if input_shape is None: + raise ValueError(f"Could not find input_shape of '{split_layer_names[idx - 1]}'.") + input_layer_dict = { + 'name': current_split_layer['name'] + '_input', + 'class_name': 'InputLayer', + 'data_format': 'channels_last', + 'input_shape': [input_shape], + } + # Insert the new input layer at the beginning + sub_layer_list.insert(0, input_layer_dict) + + # Create a shallow copy of the config for each subgraph + sub_config = copy.copy(config) + sub_config['OutputDir'] = f"{original_OutputDir}_graph{idx + 1}" + sub_config['ProjectName'] = f"{original_ProjectName}_graph{idx + 1}" + hls_model = ModelGraph(sub_config, sub_layer_list, None, None) + model_graphs.append(hls_model) + + return model_graphs From 
From 851e835140e674d784a3749756e12c9f159c73f3 Mon Sep 17 00:00:00 2001
From: dimdano
Date: Thu, 17 Oct 2024 09:36:43 +0200
Subject: [PATCH 05/50] Pass output_shapes to make_multi_graph to detect input
 shapes of split layers

---
 hls4ml/converters/keras_to_hls.py |  4 ++--
 hls4ml/model/graph.py             | 21 ++++++++++++++-------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py
index b7e82da8b1..8d9535b868 100644
--- a/hls4ml/converters/keras_to_hls.py
+++ b/hls4ml/converters/keras_to_hls.py
@@ -324,12 +324,12 @@ def parse_keras_model(model_arch, reader):
 
 def keras_to_hls(config, split_layer_names = []):
     model_arch, reader = get_model_arch(config)
-    layer_list, input_layers, output_layers, _ = parse_keras_model(model_arch, reader)
+    layer_list, input_layers, output_layers, output_shapes = parse_keras_model(model_arch, reader)
 
     print('Creating HLS model...')
     if split_layer_names:
         if all(name.startswith('fc') or name.startswith('dense') for name in split_layer_names):
-            hls_models = ModelGraph.make_multi_graph(config, layer_list, split_layer_names)
+            hls_models = ModelGraph.make_multi_graph(config, layer_list, output_shapes, split_layer_names)
             print('Multi-graph HLS model created.')
             return hls_models
         else:
diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py
index bc3ac7731b..afaf757eb9 100644
--- a/hls4ml/model/graph.py
+++ b/hls4ml/model/graph.py
@@ -899,7 +899,7 @@ def build(self, **kwargs):
         return self.config.backend.build(self, **kwargs)
 
     @classmethod
-    def make_multi_graph(cls, config, layer_list, split_layer_names):
+    def make_multi_graph(cls, config, layer_list, output_shapes, split_layer_names):
         """Splits the layer list at the specified layers and creates multiple ModelGraphs.
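+        (output_shapes is keyed by layer name and supplies the input shape of
+         each new subgraph's input layer.)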
Args: @@ -923,7 +923,7 @@ def make_multi_graph(cls, config, layer_list, split_layer_names): # Get split indices and sort them split_indices = sorted([layer_names.index(name) for name in split_layer_names]) - # Add start and end indices one after the other to cover the entire layer list + # Add start and end indices to cover the entire layer list indices = [0] + split_indices + [len(layer_list)] # Split the layer_list into subgraphs @@ -941,19 +941,26 @@ def make_multi_graph(cls, config, layer_list, split_layer_names): for idx, sub_layer_list in enumerate(subgraphs_layer_lists): # For subgraphs after the first one, insert a new input layer if idx > 0: - current_split_layer = sub_layer_list[0] - input_shape = current_split_layer.get('n_in', None) - print(current_split_layer) + # Get the previous layer's name and output shape + previous_layer_index = indices[idx] - 1 + previous_layer = layer_list[previous_layer_index] + previous_layer_name = previous_layer['name'] + input_shape = output_shapes.get(previous_layer_name, None) #NOTE - Verify that the input shape is correctly identified if input_shape is None: raise ValueError(f"Could not find input_shape of '{split_layer_names[idx - 1]}'.") + + current_split_layer = sub_layer_list[0] input_layer_dict = { 'name': current_split_layer['name'] + '_input', 'class_name': 'InputLayer', 'data_format': 'channels_last', - 'input_shape': [input_shape], + 'input_shape': input_shape[1:], } - # Insert the new input layer at the beginning + # Reset the inputs of the split layer in the current graph + #NOTE - Better allow it to automatically determine its inputs + sub_layer_list[0]['inputs'] = [] + # Then insert the new input layer at the beginning sub_layer_list.insert(0, input_layer_dict) # Create a shallow copy of the config for each subgraph From 0e0cf11edeabe2d0f01562cdd2f091fd5b2adfb9 Mon Sep 17 00:00:00 2001 From: dimdano Date: Thu, 17 Oct 2024 17:58:11 +0200 Subject: [PATCH 06/50] fixed layer index in the newly created graph --- hls4ml/model/graph.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index afaf757eb9..034465c000 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -318,7 +318,7 @@ class ModelGraph: outputs (list, optional): The outputs to the model. 
If None, determined from layer_list """ - def __init__(self, config, layer_list, inputs=None, outputs=None): + def __init__(self, config, layer_list, inputs=None, outputs=None, initial_index=0): self.config = HLSConfig(config) # keep track of the applied flows @@ -337,7 +337,7 @@ def __init__(self, config, layer_list, inputs=None, outputs=None): ) self.outputs = self._find_output_variable_names(layer_list, output_layers) - self.index = 0 + self.index = initial_index self.graph = OrderedDict() # where the nodes are stored self.output_vars = {} @@ -938,6 +938,7 @@ def make_multi_graph(cls, config, layer_list, output_shapes, split_layer_names): model_graphs = [] original_OutputDir = config['OutputDir'] original_ProjectName = config['ProjectName'] + current_index = 0 for idx, sub_layer_list in enumerate(subgraphs_layer_lists): # For subgraphs after the first one, insert a new input layer if idx > 0: @@ -967,7 +968,15 @@ def make_multi_graph(cls, config, layer_list, output_shapes, split_layer_names): sub_config = copy.copy(config) sub_config['OutputDir'] = f"{original_OutputDir}_graph{idx + 1}" sub_config['ProjectName'] = f"{original_ProjectName}_graph{idx + 1}" - hls_model = ModelGraph(sub_config, sub_layer_list, None, None) + hls_model = ModelGraph(sub_config, sub_layer_list, None, None, initial_index=current_index) + + # Update the current index for the next graph + # Get the index of the last element in the graph + layer_indices = [layer.index for layer in hls_model.graph.values()] + if layer_indices: + max_index = max(layer_indices) + current_index = max_index - 1 # we have the input layer as well + model_graphs.append(hls_model) return model_graphs From 323236b94723cfa6cbfb30dcd928a9071e5113e0 Mon Sep 17 00:00:00 2001 From: dimdano Date: Fri, 18 Oct 2024 17:43:14 +0200 Subject: [PATCH 07/50] fix minor mistakes --- hls4ml/converters/keras_to_hls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index 8d9535b868..3310c2a501 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ -328,7 +328,7 @@ def keras_to_hls(config, split_layer_names = []): print('Creating HLS model...') if split_layer_names: - if all(name.startswith('fc') or name.startswith('dense') for name in split_layer_names): + if all(name.startswith('fc') or name.startswith('dense') or name.startswith('conv') for name in split_layer_names): hls_models = ModelGraph.make_multi_graph(config, layer_list, output_shapes, split_layer_names) print('Multi-graph HLS model created.') return hls_models From f759a3e5fd0a54ba13740636738efaced4f7ed1b Mon Sep 17 00:00:00 2001 From: dimdano Date: Thu, 24 Oct 2024 15:53:19 +0200 Subject: [PATCH 08/50] Add TCL script for automatic connection of subgraph IPs in Vivado * Automatically scans and add HLS IP cores for subgraphs in Vivado * Automatically detects interface types used by the IPs (either unpacked or AXI stream) and configures the connections accordingly. * Also, updated the multigraph logic to copy the precision of the last layer from the previous graph and apply it to the input layer of the next graph. 
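* Usage sketch (illustrative only; assumes a Keras `model` object whose layer
  names include the chosen split points):

    import hls4ml

    config = hls4ml.utils.config_from_keras_model(model, granularity='name')
    hls_models = hls4ml.converters.convert_from_keras_model(
        model,
        hls_config=config,
        output_dir='hls4ml_prj',
        backend='Vitis',
        split_layer_names=['fc2', 'fc3'],
    )
    # One HLS project per subgraph: hls4ml_prj_graph1, hls4ml_prj_graph2, ...
    # After building each project, run the stitching script from the base dir:
    #   vivado -mode batch -source scripts/build_graphs.tcl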
--- hls4ml/model/graph.py | 43 +++++-- scripts/build_graphs.tcl | 234 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 270 insertions(+), 7 deletions(-) create mode 100644 scripts/build_graphs.tcl diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 034465c000..7947baf065 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -6,6 +6,7 @@ import numpy as np import numpy.ctypeslib as npc import copy +import warnings from hls4ml.backends import get_backend from hls4ml.model.flow import get_flow @@ -939,8 +940,15 @@ def make_multi_graph(cls, config, layer_list, output_shapes, split_layer_names): original_OutputDir = config['OutputDir'] original_ProjectName = config['ProjectName'] current_index = 0 + last_output_precision = None for idx, sub_layer_list in enumerate(subgraphs_layer_lists): - # For subgraphs after the first one, insert a new input layer + + # Create a shallow copy of the config for each subgraph + sub_config = copy.copy(config) + sub_config['OutputDir'] = f"{original_OutputDir}_graph{idx + 1}" + sub_config['ProjectName'] = f"{original_ProjectName}_graph{idx + 1}" + + # For subgraphs after the first one, configure new input layer if idx > 0: # Get the previous layer's name and output shape previous_layer_index = indices[idx] - 1 @@ -952,8 +960,9 @@ def make_multi_graph(cls, config, layer_list, output_shapes, split_layer_names): raise ValueError(f"Could not find input_shape of '{split_layer_names[idx - 1]}'.") current_split_layer = sub_layer_list[0] + input_layer_name = current_split_layer['name'] + '_input' input_layer_dict = { - 'name': current_split_layer['name'] + '_input', + 'name': input_layer_name, 'class_name': 'InputLayer', 'data_format': 'channels_last', 'input_shape': input_shape[1:], @@ -964,12 +973,32 @@ def make_multi_graph(cls, config, layer_list, output_shapes, split_layer_names): # Then insert the new input layer at the beginning sub_layer_list.insert(0, input_layer_dict) - # Create a shallow copy of the config for each subgraph - sub_config = copy.copy(config) - sub_config['OutputDir'] = f"{original_OutputDir}_graph{idx + 1}" - sub_config['ProjectName'] = f"{original_ProjectName}_graph{idx + 1}" - hls_model = ModelGraph(sub_config, sub_layer_list, None, None, initial_index=current_index) + # Copy 'Precision' and 'Trace' from the previous layer's config to the new input layer's config + if previous_layer_name in sub_config['HLSConfig']['LayerName']: + prev_layer_config = sub_config['HLSConfig']['LayerName'][previous_layer_name] + new_layer_config = {} + new_layer_config['Precision'] = prev_layer_config['Precision'] + #NOTE - We copy Trace as well but it might be better to reset it + new_layer_config['Trace'] = prev_layer_config['Trace'] + # copy last layer config from previous graph to the new input layer config of current graph + sub_config['HLSConfig']['LayerName'][input_layer_name] = new_layer_config + else: + raise KeyError(f"Layer '{previous_layer_name}' not found in subconfig.") + hls_model = ModelGraph(sub_config, sub_layer_list, None, None, initial_index=current_index) + + # After creating subgraph, get the precision from the last layer's output. + if hls_model.graph: + try: + last_layer = next(reversed(hls_model.graph.values())) + last_output_precision = last_layer.attributes['precision']['result'] + except (KeyError, AttributeError): + warnings.warn( + "Could not find precision in the last layer." + "Setting 'last_output_precision' to 'auto'." 
+ ) + last_output_precision = 'auto' + # Update the current index for the next graph # Get the index of the last element in the graph layer_indices = [layer.index for layer in hls_model.graph.values()] diff --git a/scripts/build_graphs.tcl b/scripts/build_graphs.tcl new file mode 100644 index 0000000000..c2a96b1804 --- /dev/null +++ b/scripts/build_graphs.tcl @@ -0,0 +1,234 @@ +# ====================================================== +# The script connects the output ports of each subgraph IP +# instance to the input ports of the next one in sequence. +# +# Run this script from the base directory containing the +# subgraph project folders (e.g., {proj_name}_graph1, etc.) +# ====================================================== + +puts "###########################################################" + +# Project base dir +set base_dir [pwd] + +# Find a directory that ends with "graph1", "graph2", etc. +set project_dirs [glob -nocomplain -directory $base_dir *graph[0-9]] + +# Check if a matching directory is found +if {[llength $project_dirs] == 0} { + puts "Error: No project directory ending with 'graph{id}' found in $base_dir" +} else { + # Get the first matching directory + set project_dir [lindex $project_dirs 0] + set project_tcl_file [file join $project_dir project.tcl] + + # Check if project.tcl exists and source it + if {[file exists $project_tcl_file]} { + puts "Sourcing $project_tcl_file from $project_dir" + source $project_tcl_file + } else { + puts "Error: project.tcl not found in $project_dir" + exit 1 + } +} + +puts "###########################################################" +puts "# Starting the IP connection process... " +puts "###########################################################" + +# Create New Vivado Project +set project_name "vivado_final_graph" +file mkdir $project_name +cd $project_name +create_project $project_name . -part $part + +# Add repositories +# Initialize the repo count +set repo_count 0 +# Loop through potential project directories +for {set i 1} {[file exists "$base_dir/hls4ml_prj_graph$i/myproject_graph${i}_prj"]} {incr i} { + set repo_path "$base_dir/hls4ml_prj_graph$i/myproject_graph${i}_prj/solution1/impl/ip" + # Check if the repository path exists + if {[file isdirectory $repo_path]} { + # Add repository path to current project's IP repository paths + set_property ip_repo_paths [concat [get_property ip_repo_paths [current_project]] $repo_path] [current_project] + + # Increment the repo count + incr repo_count + + puts "Added IP repository path: $repo_path" + } else { + puts "Directory does not exist: $repo_path" + } +} + +if { $repo_count == 0 } { + puts "No IP repositories were found in the specified directories." 
+} else { + puts "Total IP repositories added: $repo_count" +} +# Rescan repositories +update_ip_catalog + +# Name of the block design +set bd_name "design_1" +create_bd_design $bd_name + +# Add IPs to block design +for {set i 1} {$i <= $repo_count} {incr i} { + set vlnv "xilinx.com:hls:myproject_graph$i:1.0" + create_bd_cell -type ip -vlnv $vlnv "myproject_graph${i}_0" +} + +# Collect all IP instance names in a list +set ip_instances {} +for {set i 1} {$i <= $repo_count} {incr i} { + set ip_name "myproject_graph${i}_0" + lappend ip_instances $ip_name +} + +# Determine interface type +set first_ip [lindex $ip_instances 0] +set first_ip_cell [get_bd_cells $first_ip] +set first_ip_pins [get_bd_pins -of $first_ip_cell] + +set interface_type "unknown" +foreach port $first_ip_pins { + set port_name [get_property NAME $port] + if {[string match "*_TDATA" $port_name]} { + set interface_type "axi_stream" + break + } elseif {[regexp {^layer(?:\d+_)?out_(\d+)$} $port_name]} { + set interface_type "unpacked" + break + } +} + +if {$interface_type == "unknown"} { + puts "Error: Could not determine interface type." + exit 1 +} else { + puts "Interface type detected: $interface_type" +} + +# Loop over IP instances to connect outputs to inputs +for {set i 0} {$i < [expr {[llength $ip_instances] - 1}]} {incr i} { + # Get current IP and next IP + set ip_i [lindex $ip_instances $i] + set ip_i_plus1 [lindex $ip_instances [expr {$i + 1}]] + + # Get bd_cells for each IP + set ip_i_cell [get_bd_cells $ip_i] + set ip_i_plus1_cell [get_bd_cells $ip_i_plus1] + + if {$interface_type == "unpacked"} { + # Existing unpacked interface connection logic + # Get all output pins from ip_i + set output_ports [get_bd_pins -of $ip_i_cell] + + # Initialize arrays for output ports + array unset layer_out_ports_by_index + array unset layer_out_vld_ports_by_index + + # Filter output ports and extract indices + foreach port $output_ports { + set port_name [get_property NAME $port] + # Match 'layer_out_' or 'layer_out_' + if {[regexp {^layer(?:\d+_)?out_(\d+)$} $port_name all index]} { + set layer_out_ports_by_index($index) $port + } elseif {[regexp {^layer(?:\d+_)?out_(\d+)_ap_vld$} $port_name all index]} { + set layer_out_vld_ports_by_index($index) $port + } + } + + # Get all input pins from ip_i_plus1 + set input_ports [get_bd_pins -of $ip_i_plus1_cell] + + # Initialize arrays for input ports + array unset input_ports_by_index + array unset input_vld_ports_by_index + + # Filter input ports and extract indices + foreach port $input_ports { + set port_name [get_property NAME $port] + # Match '{name}_input_{index}' + if {[regexp {^\w+_input_(\d+)$} $port_name all index]} { + set input_ports_by_index($index) $port + } elseif {[regexp {^\w+_input_(\d+)_ap_vld$} $port_name all index]} { + set input_vld_ports_by_index($index) $port + } + } + + # Connect data signals + foreach index [array names layer_out_ports_by_index] { + set out_port $layer_out_ports_by_index($index) + if {[info exists input_ports_by_index($index)]} { + set in_port $input_ports_by_index($index) + # Connect the ports + connect_bd_net $out_port $in_port + } else { + puts "Warning: No matching input port found for output [get_property NAME $out_port]" + } + } + + # Connect ap_vld signals + foreach index [array names layer_out_vld_ports_by_index] { + set out_vld_port $layer_out_vld_ports_by_index($index) + if {[info exists input_vld_ports_by_index($index)]} { + set in_vld_port $input_vld_ports_by_index($index) + # Connect the ports + connect_bd_net $out_vld_port 
$in_vld_port + } else { + puts "Warning: No matching input ap_vld port found for output [get_property NAME $out_vld_port]" + } + } + } elseif {$interface_type == "axi_stream"} { + # Get AXI Stream interface pins from ip_i and ip_i_plus1 + set ip_i_intf_pins [get_bd_intf_pins -of $ip_i_cell] + set ip_i_plus1_intf_pins [get_bd_intf_pins -of $ip_i_plus1_cell] + + # Initialize variables + set ip_i_axis_master "" + set ip_i_plus1_axis_slave "" + + # Identify the Master (output) AXI Stream interface of ip_i + foreach intf_pin $ip_i_intf_pins { + set pin_name [get_property NAME $intf_pin] + # Assuming output interfaces have names ending with 'out' + if {[string match "*out" $pin_name]} { + set ip_i_axis_master $intf_pin + break + } + } + + # Identify the Slave (input) AXI Stream interface of ip_i_plus1 + foreach intf_pin $ip_i_plus1_intf_pins { + set pin_name [get_property NAME $intf_pin] + # Assuming input interfaces have names ending with 'input' + if {[string match "*input" $pin_name]} { + set ip_i_plus1_axis_slave $intf_pin + break + } + } + + # Check if both interfaces are found + if {[string length $ip_i_axis_master] && [string length $ip_i_plus1_axis_slave]} { + # Connect the AXI Stream interfaces + connect_bd_intf_net $ip_i_axis_master $ip_i_plus1_axis_slave + puts "Connected AXI Stream interface between $ip_i and $ip_i_plus1" + } else { + puts "Warning: Could not find matching AXI Stream interfaces for $ip_i and $ip_i_plus1" + } + } +} + +save_bd_design + +regenerate_bd_layout +close_project + +puts "###########################################################" +puts "# Successfully connected the ports of each IP instance " +puts "# from '[lindex $ip_instances 0]' to '[lindex $ip_instances [expr {$repo_count - 1}]]'." +puts "# A total of $repo_count IPs were connected. 
" +puts "###########################################################" From a5f82771e2ead1958c190d25dc476952e5ae7ab9 Mon Sep 17 00:00:00 2001 From: dimdano Date: Tue, 29 Oct 2024 17:33:01 +0100 Subject: [PATCH 09/50] some minor fixes in tcl script and make_multi_graph --- hls4ml/converters/keras_to_hls.py | 5 ++++- hls4ml/model/graph.py | 24 ++++++++++++++---------- scripts/build_graphs.tcl | 26 +++++++++++++++----------- 3 files changed, 33 insertions(+), 22 deletions(-) diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index 3310c2a501..237bfd33b2 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ -328,7 +328,10 @@ def keras_to_hls(config, split_layer_names = []): print('Creating HLS model...') if split_layer_names: - if all(name.startswith('fc') or name.startswith('dense') or name.startswith('conv') for name in split_layer_names): + if all(name.startswith('fc') or name.startswith('dense') or + name.startswith('conv') or + name.startswith('activation') or name.startswith('relu') + for name in split_layer_names): hls_models = ModelGraph.make_multi_graph(config, layer_list, output_shapes, split_layer_names) print('Multi-graph HLS model created.') return hls_models diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 7947baf065..b05feba515 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -974,16 +974,20 @@ def make_multi_graph(cls, config, layer_list, output_shapes, split_layer_names): sub_layer_list.insert(0, input_layer_dict) # Copy 'Precision' and 'Trace' from the previous layer's config to the new input layer's config - if previous_layer_name in sub_config['HLSConfig']['LayerName']: - prev_layer_config = sub_config['HLSConfig']['LayerName'][previous_layer_name] - new_layer_config = {} - new_layer_config['Precision'] = prev_layer_config['Precision'] - #NOTE - We copy Trace as well but it might be better to reset it - new_layer_config['Trace'] = prev_layer_config['Trace'] - # copy last layer config from previous graph to the new input layer config of current graph - sub_config['HLSConfig']['LayerName'][input_layer_name] = new_layer_config + if 'LayerName' in sub_config['HLSConfig']: + if previous_layer_name in sub_config['HLSConfig']['LayerName']: + prev_layer_config = sub_config['HLSConfig']['LayerName'][previous_layer_name] + new_layer_config = {} + new_layer_config['Precision'] = prev_layer_config['Precision'] + #NOTE - We copy Trace as well but it might be better to reset it + new_layer_config['Trace'] = prev_layer_config['Trace'] + # copy last layer config from previous graph to the new input layer config of current graph + sub_config['HLSConfig']['LayerName'][input_layer_name] = new_layer_config + else: + raise KeyError(f"Layer '{previous_layer_name}' not found in subconfig.") else: - raise KeyError(f"Layer '{previous_layer_name}' not found in subconfig.") + # case of granularity='Model' + pass hls_model = ModelGraph(sub_config, sub_layer_list, None, None, initial_index=current_index) @@ -994,7 +998,7 @@ def make_multi_graph(cls, config, layer_list, output_shapes, split_layer_names): last_output_precision = last_layer.attributes['precision']['result'] except (KeyError, AttributeError): warnings.warn( - "Could not find precision in the last layer." + "Could not find precision in the last layer. " "Setting 'last_output_precision' to 'auto'." 
) last_output_precision = 'auto' diff --git a/scripts/build_graphs.tcl b/scripts/build_graphs.tcl index c2a96b1804..078dd57387 100644 --- a/scripts/build_graphs.tcl +++ b/scripts/build_graphs.tcl @@ -43,32 +43,36 @@ cd $project_name create_project $project_name . -part $part # Add repositories -# Initialize the repo count +# Loop through each project directory set repo_count 0 -# Loop through potential project directories -for {set i 1} {[file exists "$base_dir/hls4ml_prj_graph$i/myproject_graph${i}_prj"]} {incr i} { - set repo_path "$base_dir/hls4ml_prj_graph$i/myproject_graph${i}_prj/solution1/impl/ip" +foreach dir $project_dirs { + # Check if we have exactly one _prj directory + set prj_dirs [glob -directory $dir -type d "*_prj"] + if {[llength $prj_dirs] != 1} { + error "Expected exactly one *_prj directory, but found [llength $prj_dirs] in directory: $dir" + } + # If exactly one *_prj is found, proceed to construct the repo_path + set first_prj_dir [lindex $prj_dirs 0] + set repo_path "${first_prj_dir}/solution1/impl/ip" + # Check if the repository path exists if {[file isdirectory $repo_path]} { # Add repository path to current project's IP repository paths set_property ip_repo_paths [concat [get_property ip_repo_paths [current_project]] $repo_path] [current_project] - - # Increment the repo count incr repo_count - - puts "Added IP repository path: $repo_path" } else { puts "Directory does not exist: $repo_path" } } +# Rescan repositories +update_ip_catalog + if { $repo_count == 0 } { puts "No IP repositories were found in the specified directories." } else { puts "Total IP repositories added: $repo_count" } -# Rescan repositories -update_ip_catalog # Name of the block design set bd_name "design_1" @@ -231,4 +235,4 @@ puts "###########################################################" puts "# Successfully connected the ports of each IP instance " puts "# from '[lindex $ip_instances 0]' to '[lindex $ip_instances [expr {$repo_count - 1}]]'." puts "# A total of $repo_count IPs were connected. " -puts "###########################################################" +puts "###########################################################" \ No newline at end of file From 07d23ae8e97d9638682ea9b5032e38cd119a9804 Mon Sep 17 00:00:00 2001 From: dimdano Date: Thu, 31 Oct 2024 17:28:05 +0100 Subject: [PATCH 10/50] support for parallel subgraph builds. Also, make_multi_graph now returns a MultiModelGraph instance --- hls4ml/backends/vitis/vitis_backend.py | 43 +++++++----- hls4ml/model/graph.py | 95 +++++++++++++++++++++++++- 2 files changed, 117 insertions(+), 21 deletions(-) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index cf623bb19a..bac04de185 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -1,5 +1,6 @@ import os import sys +import subprocess from hls4ml.backends import VivadoBackend from hls4ml.model.flow import get_flow, register_flow @@ -100,23 +101,29 @@ def build( if found != 0: raise Exception('Vitis HLS installation not found. 
Make sure "vitis_hls" is on PATH.') - curr_dir = os.getcwd() - os.chdir(model.config.get_output_dir()) - os.system( - ( - 'vitis_hls -f build_prj.tcl "reset={reset} csim={csim} synth={synth} cosim={cosim} ' - 'validation={validation} export={export} vsynth={vsynth} fifo_opt={fifo_opt}"' - ).format( - reset=reset, - csim=csim, - synth=synth, - cosim=cosim, - validation=validation, - export=export, - vsynth=vsynth, - fifo_opt=fifo_opt, + build_command = ( + 'vitis_hls -f build_prj.tcl "reset={reset} csim={csim} synth={synth} cosim={cosim} ' + 'validation={validation} export={export} vsynth={vsynth} fifo_opt={fifo_opt}"' + ).format(reset=reset, csim=csim, synth=synth, cosim=cosim, validation=validation, export=export, vsynth=vsynth, fifo_opt=fifo_opt) + + output_dir = model.config.get_output_dir() + # Define log file paths + # NOTE - 'build_stdout.log' is the same as 'vitis_hls.log' + stdout_log = os.path.join(output_dir, 'build_stdout.log') + stderr_log = os.path.join(output_dir, 'build_stderr.log') + + with open(stdout_log, 'w') as stdout_file, open(stderr_log, 'w') as stderr_file: + # Use subprocess.Popen to capture output + process = subprocess.Popen( + build_command, + shell=True, + cwd=output_dir, + stdout=stdout_file, + stderr=stderr_file, + text=True ) - ) - os.chdir(curr_dir) + process.communicate() + if process.returncode != 0: + raise Exception(f'Build failed for {model.config.get_project_name()}. See logs for details.') - return parse_vivado_report(model.config.get_output_dir()) + return parse_vivado_report(output_dir) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index b05feba515..204ab06ed5 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -7,6 +7,8 @@ import numpy.ctypeslib as npc import copy import warnings +import concurrent.futures +import threading from hls4ml.backends import get_backend from hls4ml.model.flow import get_flow @@ -986,8 +988,7 @@ def make_multi_graph(cls, config, layer_list, output_shapes, split_layer_names): else: raise KeyError(f"Layer '{previous_layer_name}' not found in subconfig.") else: - # case of granularity='Model' - pass + pass # case of granularity='Model' hls_model = ModelGraph(sub_config, sub_layer_list, None, None, initial_index=current_index) @@ -1012,4 +1013,92 @@ def make_multi_graph(cls, config, layer_list, output_shapes, split_layer_names): model_graphs.append(hls_model) - return model_graphs + return MultiModelGraph(model_graphs) + + +class MultiModelGraph: + def __init__(self, graphs): + self.graphs = graphs + + def build(self, max_workers=None, **kwargs): + # Build all ModelGraph instances in parallel. 
+ build_results = {} + total_builds = len(self.graphs) + status = {} + status_lock = threading.Lock() + + # Initialize statuses + for g in self.graphs: + project_name = g.config.get_project_name() + status[project_name] = 'Pending' + + def build_wrapper(g, **kwargs): + project_name = g.config.get_project_name() + with status_lock: + status[project_name] = 'Running' + self._print_status(status) + try: + result = g.build(**kwargs) + with status_lock: + status[project_name] = 'Completed' + self._print_status(status) + return result + except Exception as exc: + with status_lock: + status[project_name] = 'Failed' + self._print_status(status) + raise + + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + future_to_g = {executor.submit(build_wrapper, g, **kwargs): g for g in self.graphs} + for future in concurrent.futures.as_completed(future_to_g): + g = future_to_g[future] + project_name = g.config.get_project_name() + try: + result = future.result() + build_results[project_name] = result + except Exception as exc: + build_results[project_name] = None + return build_results + + def _print_status(self, status): + # Clear the terminal line and print build status + print('\r', end='') + status_str = ' | '.join(f'{proj}: {stat}' for proj, stat in status.items()) + print(status_str, end='', flush=True) + + def compile(self): + for g in self.graphs: + g.compile() + + def predict(self, x): + # Pass the data through each ModelGraph in sequence + input_data = x + for g in self.graphs: + # Predict with the current ModelGraph + output_data = g.predict(input_data) + input_data = output_data + return output_data + + def trace(self, x): + # Pass the data through each ModelGraph in sequence + input_data = x + trace_output = [] + for g in self.graphs: + # Trace with the current ModelGraph + output_data, curr_trace_output = g.trace(input_data) + input_data = output_data + trace_output.append(curr_trace_output) + return output_data, trace_output + + def _print_status(self, status): + # Clear the terminal line and print build status + print('\r', end='') + status_icons = { + 'Pending': '○', + 'Running': '⌛', + 'Completed': '✅', + 'Failed': '❌' + } + status_str = ' | '.join(f'{proj}: {status_icons.get(stat, "?")}' for proj, stat in status.items()) + print(status_str, flush=True) \ No newline at end of file From 5dc4ac66b3884a29f7534d22092e9fe4da96ad09 Mon Sep 17 00:00:00 2001 From: dimdano Date: Tue, 12 Nov 2024 17:45:28 +0100 Subject: [PATCH 11/50] new tcl script --- hls4ml/model/graph.py | 6 - scripts/build_graphs.tcl | 274 +++++++++++++++++++++++++++++++++++---- 2 files changed, 252 insertions(+), 28 deletions(-) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 204ab06ed5..4e119fc9a4 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -1061,12 +1061,6 @@ def build_wrapper(g, **kwargs): build_results[project_name] = None return build_results - def _print_status(self, status): - # Clear the terminal line and print build status - print('\r', end='') - status_str = ' | '.join(f'{proj}: {stat}' for proj, stat in status.items()) - print(status_str, end='', flush=True) - def compile(self): for g in self.graphs: g.compile() diff --git a/scripts/build_graphs.tcl b/scripts/build_graphs.tcl index 078dd57387..c0302e22db 100644 --- a/scripts/build_graphs.tcl +++ b/scripts/build_graphs.tcl @@ -2,6 +2,17 @@ # The script connects the output ports of each subgraph IP # instance to the input ports of the next one in sequence. 
# +# Modifications: +# - Connect and make external signals starting with 'ap_clk' and 'ap_rst'. +# - For AXI Stream interfaces: +# - Connect and make external signals starting with 'ap_start'. +# - Make external the input of the first IP and the output of the last IP. +# - For unpacked interfaces: +# - Connect 'ap_done' of ip_i to 'ap_start' of ip_i_plus1. +# - Make external 'ap_start' of the first IP and 'ap_done' of the last IP. +# - Make external the inputs of the first IP and outputs of the last IP (including 'vld' signals). +# - Make external the 'ap_done' signal of the last IP. +# # Run this script from the base directory containing the # subgraph project folders (e.g., {proj_name}_graph1, etc.) # ====================================================== @@ -36,6 +47,7 @@ puts "###########################################################" puts "# Starting the IP connection process... " puts "###########################################################" + # Create New Vivado Project set project_name "vivado_final_graph" file mkdir $project_name @@ -43,39 +55,35 @@ cd $project_name create_project $project_name . -part $part # Add repositories -# Loop through each project directory +# Initialize the repo count set repo_count 0 -foreach dir $project_dirs { - # Check if we have exactly one _prj directory - set prj_dirs [glob -directory $dir -type d "*_prj"] - if {[llength $prj_dirs] != 1} { - error "Expected exactly one *_prj directory, but found [llength $prj_dirs] in directory: $dir" - } - # If exactly one *_prj is found, proceed to construct the repo_path - set first_prj_dir [lindex $prj_dirs 0] - set repo_path "${first_prj_dir}/solution1/impl/ip" - +# Loop through potential project directories +for {set i 1} {[file exists "$base_dir/hls4ml_prj_graph$i/myproject_graph${i}_prj"]} {incr i} { + set repo_path "$base_dir/hls4ml_prj_graph$i/myproject_graph${i}_prj/solution1/impl/ip" # Check if the repository path exists if {[file isdirectory $repo_path]} { # Add repository path to current project's IP repository paths set_property ip_repo_paths [concat [get_property ip_repo_paths [current_project]] $repo_path] [current_project] + + # Increment the repo count incr repo_count + + puts "Added IP repository path: $repo_path" } else { puts "Directory does not exist: $repo_path" } } -# Rescan repositories -update_ip_catalog - if { $repo_count == 0 } { puts "No IP repositories were found in the specified directories." 
} else { puts "Total IP repositories added: $repo_count" } +# Rescan repositories +update_ip_catalog # Name of the block design -set bd_name "design_1" +set bd_name "stitched_design" create_bd_design $bd_name # Add IPs to block design @@ -91,6 +99,56 @@ for {set i 1} {$i <= $repo_count} {incr i} { lappend ip_instances $ip_name } +# Collect 'ap_clk' and 'ap_rst' signals from all IPs +set ap_clk_ports {} +set ap_rst_ports {} + +foreach ip $ip_instances { + set ip_cell [get_bd_cells $ip] + set ip_pins [get_bd_pins -of $ip_cell] + foreach pin $ip_pins { + set pin_name [get_property NAME $pin] + if {[string match "ap_clk" $pin_name]} { + lappend ap_clk_ports $pin + } elseif {[string match "ap_rst" $pin_name]} { + lappend ap_rst_ports $pin + } + } +} + +# Create external ports for 'ap_clk' and 'ap_rst' +# ap_clk +if {[llength $ap_clk_ports] > 0} { + create_bd_port -dir I -type clk -freq_hz 100000000 ap_clk + set ap_clk_port [get_bd_ports ap_clk] + # Connect all 'ap_clk' pins to the 'ap_clk' port + foreach clk_pin $ap_clk_ports { + connect_bd_net $ap_clk_port $clk_pin + } +} + +# ap_rst +if {[llength $ap_rst_ports] > 0} { + # Get the CONFIG.POLARITY property from one of the IP's 'ap_rst' pins + set sample_rst_pin [lindex $ap_rst_ports 0] + set rst_polarity [get_property CONFIG.POLARITY $sample_rst_pin] + # Create the 'ap_rst' port + create_bd_port -dir I -type rst ap_rst + set ap_rst_port [get_bd_ports ap_rst] + + # Set the CONFIG.POLARITY property of the 'ap_rst' port based on the retrieved polarity + if {$rst_polarity ne ""} { + set_property CONFIG.POLARITY $rst_polarity $ap_rst_port + } else { + # Fallback to ACTIVE_HIGH if the retrieved polarity is not defined + set_property CONFIG.POLARITY ACTIVE_HIGH $ap_rst_port + } + # Connect all 'ap_rst' pins to the 'ap_rst' port + foreach rst_pin $ap_rst_ports { + connect_bd_net $ap_rst_port $rst_pin + } +} + # Determine interface type set first_ip [lindex $ip_instances 0] set first_ip_cell [get_bd_cells $first_ip] @@ -115,6 +173,19 @@ if {$interface_type == "unknown"} { puts "Interface type detected: $interface_type" } +# Collect 'ap_start' signals from all IPs +set ap_start_ports {} +foreach ip $ip_instances { + set ip_cell [get_bd_cells $ip] + set ip_pins [get_bd_pins -of $ip_cell] + foreach pin $ip_pins { + set pin_name [get_property NAME $pin] + if {[string match "ap_start" $pin_name]} { + lappend ap_start_ports $pin + } + } +} + # Loop over IP instances to connect outputs to inputs for {set i 0} {$i < [expr {[llength $ip_instances] - 1}]} {incr i} { # Get current IP and next IP @@ -186,6 +257,36 @@ for {set i 0} {$i < [expr {[llength $ip_instances] - 1}]} {incr i} { puts "Warning: No matching input ap_vld port found for output [get_property NAME $out_vld_port]" } } + + # Connect 'ap_done' of ip_i to 'ap_start' of ip_i_plus1 + # Get 'ap_done' pin of ip_i + set ip_i_pins [get_bd_pins -of $ip_i_cell] + set ap_done_pin "" + foreach pin $ip_i_pins { + set pin_name [get_property NAME $pin] + if {[string match "ap_done" $pin_name]} { + set ap_done_pin $pin + break + } + } + + # Get 'ap_start' pin of ip_i_plus1 + set ip_i_plus1_pins [get_bd_pins -of $ip_i_plus1_cell] + set ap_start_pin "" + foreach pin $ip_i_plus1_pins { + set pin_name [get_property NAME $pin] + if {[string match "ap_start" $pin_name]} { + set ap_start_pin $pin + break + } + } + + # Connect 'ap_done' of ip_i to 'ap_start' of ip_i_plus1 + if {[string length $ap_done_pin] > 0 && [string length $ap_start_pin] > 0} { + connect_bd_net $ap_done_pin $ap_start_pin + } else { + puts "Warning: 
Could not find 'ap_done' or 'ap_start' pin for IPs $ip_i and $ip_i_plus1" + } } elseif {$interface_type == "axi_stream"} { # Get AXI Stream interface pins from ip_i and ip_i_plus1 set ip_i_intf_pins [get_bd_intf_pins -of $ip_i_cell] @@ -226,13 +327,142 @@ for {set i 0} {$i < [expr {[llength $ip_instances] - 1}]} {incr i} { } } +if {$interface_type == "axi_stream"} { + # Create external port for 'ap_start' and connect all 'ap_start' pins + if {[llength $ap_start_ports] > 0} { + create_bd_port -dir I ap_start + set ap_start_port [get_bd_ports ap_start] + foreach start_pin $ap_start_ports { + connect_bd_net $ap_start_port $start_pin + } + } + + # Make external the input interface of the first IP + set first_ip_cell [get_bd_cells [lindex $ip_instances 0]] + set first_ip_intf_pins [get_bd_intf_pins -of $first_ip_cell] + set first_ip_axis_slave "" + foreach intf_pin $first_ip_intf_pins { + set pin_name [get_property NAME $intf_pin] + if {[string match "*input" $pin_name]} { + set first_ip_axis_slave $intf_pin + break + } + } + if {[string length $first_ip_axis_slave] > 0} { + create_bd_intf_port -mode Slave -vlnv [get_property VLNV $first_ip_axis_slave] [get_property NAME $first_ip_axis_slave] + set external_intf_port [get_bd_intf_ports [get_property NAME $first_ip_axis_slave]] + connect_bd_intf_net $external_intf_port $first_ip_axis_slave + } else { + puts "Warning: Could not find input AXI Stream interface for first IP" + } + + # Make external the output interface of the last IP + set last_ip_cell [get_bd_cells [lindex $ip_instances end]] + set last_ip_intf_pins [get_bd_intf_pins -of $last_ip_cell] + set last_ip_axis_master "" + foreach intf_pin $last_ip_intf_pins { + set pin_name [get_property NAME $intf_pin] + if {[string match "*out" $pin_name]} { + set last_ip_axis_master $intf_pin + break + } + } + if {[string length $last_ip_axis_master] > 0} { + create_bd_intf_port -mode Master -vlnv [get_property VLNV $last_ip_axis_master] [get_property NAME $last_ip_axis_master] + set external_intf_port [get_bd_intf_ports [get_property NAME $last_ip_axis_master]] + connect_bd_intf_net $external_intf_port $last_ip_axis_master + } else { + puts "Warning: Could not find output AXI Stream interface for last IP" + } + + # Make external the 'ap_done' signal of the last IP + set last_ip_pins [get_bd_pins -of $last_ip_cell] + set last_ap_done_pin "" + foreach pin $last_ip_pins { + set pin_name [get_property NAME $pin] + if {[string match "ap_done" $pin_name]} { + set last_ap_done_pin $pin + break + } + } + if {[string length $last_ap_done_pin] > 0} { + create_bd_port -dir O ap_done + set ap_done_port [get_bd_ports ap_done] + connect_bd_net $ap_done_port $last_ap_done_pin + } else { + puts "Warning: Could not find 'ap_done' pin for last IP" + } +} elseif {$interface_type == "unpacked"} { + # Make 'ap_start' of the first IP external + set first_ip_cell [get_bd_cells [lindex $ip_instances 0]] + set first_ip_pins [get_bd_pins -of $first_ip_cell] + set first_ap_start_pin "" + foreach pin $first_ip_pins { + set pin_name [get_property NAME $pin] + if {[string match "ap_start" $pin_name]} { + set first_ap_start_pin $pin + break + } + } + if {[string length $first_ap_start_pin] > 0} { + create_bd_port -dir I ap_start + set ap_start_port [get_bd_ports ap_start] + connect_bd_net $ap_start_port $first_ap_start_pin + } else { + puts "Warning: Could not find 'ap_start' pin for first IP" + } + + # Make 'ap_done' of the last IP external + set last_ip_cell [get_bd_cells [lindex $ip_instances end]] + set last_ip_pins 
[get_bd_pins -of $last_ip_cell] + set last_ap_done_pin "" + foreach pin $last_ip_pins { + set pin_name [get_property NAME $pin] + if {[string match "ap_done" $pin_name]} { + set last_ap_done_pin $pin + break + } + } + if {[string length $last_ap_done_pin] > 0} { + create_bd_port -dir O ap_done + set ap_done_port [get_bd_ports ap_done] + connect_bd_net $ap_done_port $last_ap_done_pin + } else { + puts "Warning: Could not find 'ap_done' pin for last IP" + } + + # Make external the inputs of the first IP (including 'vld' signals) + set first_ip_input_ports [get_bd_pins -of $first_ip_cell] + foreach pin $first_ip_input_ports { + set pin_name [get_property NAME $pin] + if {[regexp {^\w+_input_(\d+)$} $pin_name all index]} { + create_bd_port -dir I $pin_name + set external_port [get_bd_ports $pin_name] + connect_bd_net $external_port $pin + } elseif {[regexp {^\w+_input_(\d+)_ap_vld$} $pin_name all index]} { + create_bd_port -dir I $pin_name + set external_port [get_bd_ports $pin_name] + connect_bd_net $external_port $pin + } + } + + # Make external the outputs of the last IP (including 'vld' signals) + set last_ip_output_ports [get_bd_pins -of $last_ip_cell] + foreach pin $last_ip_output_ports { + set pin_name [get_property NAME $pin] + if {[regexp {^layer(?:\d+_)?out_(\d+)$} $pin_name all index]} { + create_bd_port -dir O $pin_name + set external_port [get_bd_ports $pin_name] + connect_bd_net $external_port $pin + } elseif {[regexp {^layer(?:\d+_)?out_(\d+)_ap_vld$} $pin_name all index]} { + create_bd_port -dir O $pin_name + set external_port [get_bd_ports $pin_name] + connect_bd_net $external_port $pin + } + } +} + save_bd_design regenerate_bd_layout -close_project - -puts "###########################################################" -puts "# Successfully connected the ports of each IP instance " -puts "# from '[lindex $ip_instances 0]' to '[lindex $ip_instances [expr {$repo_count - 1}]]'." -puts "# A total of $repo_count IPs were connected. " -puts "###########################################################" \ No newline at end of file +close_project \ No newline at end of file From 202991d1c433a08bfad5e145666388ea998f4564 Mon Sep 17 00:00:00 2001 From: dimdano Date: Wed, 13 Nov 2024 17:20:23 +0100 Subject: [PATCH 12/50] connected external and control signals --- scripts/build_graphs.tcl | 121 +++++++++++++++++++++++++-------------- 1 file changed, 78 insertions(+), 43 deletions(-) diff --git a/scripts/build_graphs.tcl b/scripts/build_graphs.tcl index c0302e22db..553dd5e6b4 100644 --- a/scripts/build_graphs.tcl +++ b/scripts/build_graphs.tcl @@ -1,17 +1,7 @@ # ====================================================== # The script connects the output ports of each subgraph IP -# instance to the input ports of the next one in sequence. -# -# Modifications: -# - Connect and make external signals starting with 'ap_clk' and 'ap_rst'. -# - For AXI Stream interfaces: -# - Connect and make external signals starting with 'ap_start'. -# - Make external the input of the first IP and the output of the last IP. -# - For unpacked interfaces: -# - Connect 'ap_done' of ip_i to 'ap_start' of ip_i_plus1. -# - Make external 'ap_start' of the first IP and 'ap_done' of the last IP. -# - Make external the inputs of the first IP and outputs of the last IP (including 'vld' signals). -# - Make external the 'ap_done' signal of the last IP. 
+# instance to the input ports of the next one in sequence,
+# and makes important signals external
 #
 # Run this script from the base directory containing the
 # subgraph project folders (e.g., {proj_name}_graph1, etc.)
@@ -108,9 +98,9 @@ foreach ip $ip_instances {
     set ip_pins [get_bd_pins -of $ip_cell]
     foreach pin $ip_pins {
         set pin_name [get_property NAME $pin]
-        if {[string match "ap_clk" $pin_name]} {
+        if {[string match "ap_clk*" $pin_name]} {
             lappend ap_clk_ports $pin
-        } elseif {[string match "ap_rst" $pin_name]} {
+        } elseif {[string match "ap_rst*" $pin_name]} {
             lappend ap_rst_ports $pin
         }
     }
@@ -339,42 +329,64 @@ if {$interface_type == "axi_stream"} {
 
     # Make external the input interface of the first IP
     set first_ip_cell [get_bd_cells [lindex $ip_instances 0]]
+    if {[string length $first_ip_cell] == 0} {
+        puts "Error: Could not find the first IP cell."
+        return
+    }
     set first_ip_intf_pins [get_bd_intf_pins -of $first_ip_cell]
     set first_ip_axis_slave ""
     foreach intf_pin $first_ip_intf_pins {
         set pin_name [get_property NAME $intf_pin]
-        if {[string match "*input" $pin_name]} {
+        if {[string match "*s_axis*" $pin_name] || [string match "*input*" $pin_name]} {
             set first_ip_axis_slave $intf_pin
             break
         }
     }
     if {[string length $first_ip_axis_slave] > 0} {
-        create_bd_intf_port -mode Slave -vlnv [get_property VLNV $first_ip_axis_slave] [get_property NAME $first_ip_axis_slave]
-        set external_intf_port [get_bd_intf_ports [get_property NAME $first_ip_axis_slave]]
-        connect_bd_intf_net $external_intf_port $first_ip_axis_slave
+        # Make the interface pin external
+        make_bd_intf_pins_external $first_ip_axis_slave
+        # Retrieve the external interface port
+        set external_intf_port [get_bd_intf_ports -filter "NAME =~ \"${pin_name}*\""]
+        # Change name to base_name and associate clock
+        set_property NAME $pin_name $external_intf_port
+        set input_pin_name $pin_name
     } else {
-        puts "Warning: Could not find input AXI Stream interface for first IP"
+        puts "Error: Could not find input AXI Stream interface for first IP."
+        return
     }
 
+
     # Make external the output interface of the last IP
     set last_ip_cell [get_bd_cells [lindex $ip_instances end]]
+    if {[string length $last_ip_cell] == 0} {
+        puts "Error: Could not find the last IP cell."
+        return
+    }
     set last_ip_intf_pins [get_bd_intf_pins -of $last_ip_cell]
     set last_ip_axis_master ""
    foreach intf_pin $last_ip_intf_pins {
         set pin_name [get_property NAME $intf_pin]
-        if {[string match "*out" $pin_name]} {
+        if {[string match "*m_axis*" $pin_name] || [string match "*out*" $pin_name]} {
             set last_ip_axis_master $intf_pin
             break
         }
     }
     if {[string length $last_ip_axis_master] > 0} {
-        create_bd_intf_port -mode Master -vlnv [get_property VLNV $last_ip_axis_master] [get_property NAME $last_ip_axis_master]
-        set external_intf_port [get_bd_intf_ports [get_property NAME $last_ip_axis_master]]
-        connect_bd_intf_net $external_intf_port $last_ip_axis_master
+        # Make the interface pin external
+        make_bd_intf_pins_external $last_ip_axis_master
+        # Retrieve the external interface port
+        set external_intf_port [get_bd_intf_ports -filter "NAME =~ \"${pin_name}*\""]
+        # Change name to base_name and associate clock
+        set_property NAME $pin_name $external_intf_port
+        set output_pin_name $pin_name
     } else {
-        puts "Warning: Could not find output AXI Stream interface for last IP"
+        puts "Error: Could not find output AXI Stream interface for last IP."
+        return
     }
 
+    # associate input and output bus interfaces to run at ap_clk
+    set_property CONFIG.ASSOCIATED_BUSIF [list "${input_pin_name}:${output_pin_name}"] [get_bd_ports /ap_clk]
+
     # Make external the 'ap_done' signal of the last IP
     set last_ip_pins [get_bd_pins -of $last_ip_cell]
     set last_ap_done_pin ""
@@ -392,6 +404,7 @@ if {$interface_type == "axi_stream"} {
     } else {
         puts "Warning: Could not find 'ap_done' pin for last IP"
     }
+
 } elseif {$interface_type == "unpacked"} {
     # Make 'ap_start' of the first IP external
     set first_ip_cell [get_bd_cells [lindex $ip_instances 0]]
@@ -434,30 +447,44 @@ if {$interface_type == "axi_stream"} {
     # Make external the inputs of the first IP (including 'vld' signals)
     set first_ip_input_ports [get_bd_pins -of $first_ip_cell]
     foreach pin $first_ip_input_ports {
-        set pin_name [get_property NAME $pin]
-        if {[regexp {^\w+_input_(\d+)$} $pin_name all index]} {
-            create_bd_port -dir I $pin_name
-            set external_port [get_bd_ports $pin_name]
-            connect_bd_net $external_port $pin
-        } elseif {[regexp {^\w+_input_(\d+)_ap_vld$} $pin_name all index]} {
-            create_bd_port -dir I $pin_name
-            set external_port [get_bd_ports $pin_name]
-            connect_bd_net $external_port $pin
-        }
+        set pin_name [get_property NAME $pin]
+        # Match patterns for inputs and input valid pins
+        if {[regexp {^\w+_input_(\d+)$} $pin_name] || [regexp {^\w+_input_(\d+)_ap_vld$} $pin_name]} {
+            # Get pin properties
+            set pin_dir [get_property DIR $pin]
+            set pin_left [get_property LEFT $pin]
+            set pin_right [get_property RIGHT $pin]
+            set pin_type [get_property TYPE $pin]
+            if {$pin_left ne "" && $pin_right ne ""} {
+                # Create an external port with the same name, bit range and type
+                set ext_port [create_bd_port -dir $pin_dir -from $pin_left -to $pin_right -type $pin_type $pin_name]
+            } else {
+                # For single-bit signals where LEFT and RIGHT may not be defined
+                set ext_port [create_bd_port -dir $pin_dir -type $pin_type $pin_name]
+            }
+            connect_bd_net $ext_port $pin
+        }
     }
 
     # Make external the outputs of the last IP (including 'vld' signals)
     set last_ip_output_ports [get_bd_pins -of $last_ip_cell]
     foreach pin $last_ip_output_ports {
         set pin_name [get_property NAME $pin]
-        if {[regexp {^layer(?:\d+_)?out_(\d+)$} $pin_name all index]} {
-            create_bd_port -dir O $pin_name
-            set external_port [get_bd_ports $pin_name]
-            connect_bd_net $external_port $pin
-        } elseif {[regexp {^layer(?:\d+_)?out_(\d+)_ap_vld$} $pin_name all index]} {
-            create_bd_port -dir O $pin_name
-            set external_port [get_bd_ports $pin_name]
-            connect_bd_net $external_port $pin
+        # Match patterns for ouputs and output valid pins
+        if {[regexp {^layer(?:\d+_)?out_(\d+)$} $pin_name] || [regexp {^layer(?:\d+_)?out_(\d+)_ap_vld$} $pin_name]} {
+            # Get pin properties
+            set pin_dir [get_property DIR $pin]
+            set pin_left [get_property LEFT $pin]
+            set pin_right [get_property RIGHT $pin]
+            set pin_type [get_property TYPE $pin]
+            if {$pin_left ne "" && $pin_right ne ""} {
+                # Create an external port with the same name, bit range and type
+                set ext_port [create_bd_port -dir $pin_dir -from $pin_left -to $pin_right -type $pin_type $pin_name]
+            } else {
+                # For single-bit signals where LEFT and RIGHT may not be defined
+                set ext_port [create_bd_port -dir $pin_dir -type $pin_type $pin_name]
+            }
+            connect_bd_net $ext_port $pin
         }
     }
 }
@@ -465,4 +492,12 @@ if {$interface_type == "axi_stream"} {
 
 save_bd_design
 regenerate_bd_layout
-close_project
\ No newline at end of file
+close_project
+
+
+puts "###########################################################"
+puts "# Successfully connected the ports of each IP instance    "
+puts "# from '[lindex $ip_instances 0]' to '[lindex $ip_instances [expr {$repo_count - 1}]]'."
+puts "# A total of $repo_count IPs were connected.               "
+puts "###########################################################"
+

From dc6072247065cf58b91e8ff776955026216b0512 Mon Sep 17 00:00:00 2001
From: dimdano
Date: Thu, 14 Nov 2024 17:45:55 +0100
Subject: [PATCH 13/50] integrate ip_stitcher tcl script in hls4ml

---
 hls4ml/backends/vitis/vitis_backend.py        | 25 +++++++
 hls4ml/model/graph.py                         |  7 ++
 scripts/{build_graphs.tcl => ip_stitcher.tcl} | 75 ++++++++-----------
 3 files changed, 62 insertions(+), 45 deletions(-)
 rename scripts/{build_graphs.tcl => ip_stitcher.tcl} (86%)

diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py
index bac04de185..d62442e4a2 100644
--- a/hls4ml/backends/vitis/vitis_backend.py
+++ b/hls4ml/backends/vitis/vitis_backend.py
@@ -1,6 +1,7 @@
 import os
 import sys
 import subprocess
+import importlib.util
 
 from hls4ml.backends import VivadoBackend
 from hls4ml.model.flow import get_flow, register_flow
@@ -127,3 +128,27 @@ def build(
             raise Exception(f'Build failed for {model.config.get_project_name()}. See logs for details.')
 
         return parse_vivado_report(output_dir)
+
+    def stitch_design(self, output_dir, project_name):
+        os.makedirs(output_dir, exist_ok=True)
+
+        spec = importlib.util.find_spec("hls4ml")
+        hls4ml_path = os.path.dirname(spec.origin)
+        stitch_command = 'vivado -mode batch -nojournal -nolog -notrace -source ' + hls4ml_path + '/../scripts/ip_stitcher.tcl'
+
+        stdout_log = os.path.join(output_dir, 'stitcher_stdout.log')
+        stderr_log = os.path.join(output_dir, 'stitcher_stderr.log')
+
+        with open(stdout_log, 'w') as stdout_file, open(stderr_log, 'w') as stderr_file:
+            # Use subprocess.Popen to capture output
+            process = subprocess.Popen(
+                stitch_command,
+                shell=True,
+                cwd=output_dir,
+                stdout=stdout_file,
+                stderr=stderr_file,
+                text=True
+            )
+            process.communicate()
+            if process.returncode != 0:
+                raise Exception(f'Stitching failed for {project_name}. See logs for details.')
diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py
index 4e119fc9a4..962b75336b 100644
--- a/hls4ml/model/graph.py
+++ b/hls4ml/model/graph.py
@@ -6,6 +6,7 @@ import numpy as np
 import numpy.ctypeslib as npc
 import copy
+import re
 import warnings
 import concurrent.futures
 import threading
@@ -1019,6 +1020,9 @@ def make_multi_graph(cls, config, layer_list, output_shapes, split_layer_names):
 class MultiModelGraph:
     def __init__(self, graphs):
         self.graphs = graphs
+        self.project_name = re.sub(r'_graph\d+$', '_stitched', graphs[0].config.get_project_name())
+        self.output_dir = graphs[0].config.get_output_dir().split('/')[0]
+        self.backend = self.graphs[0].config.backend
 
     def build(self, max_workers=None, **kwargs):
        # Build all ModelGraph instances in parallel.
@@ -1085,6 +1089,9 @@ def trace(self, x): trace_output.append(curr_trace_output) return output_data, trace_output + def stitch_design(self, **kwargs): + self.backend.stitch_design(self.output_dir, self.project_name, **kwargs) + def _print_status(self, status): # Clear the terminal line and print build status print('\r', end='') diff --git a/scripts/build_graphs.tcl b/scripts/ip_stitcher.tcl similarity index 86% rename from scripts/build_graphs.tcl rename to scripts/ip_stitcher.tcl index 553dd5e6b4..706869656f 100644 --- a/scripts/build_graphs.tcl +++ b/scripts/ip_stitcher.tcl @@ -39,7 +39,7 @@ puts "###########################################################" # Create New Vivado Project -set project_name "vivado_final_graph" +set project_name "vivado_stitched_design" file mkdir $project_name cd $project_name create_project $project_name . -part $part @@ -274,6 +274,7 @@ for {set i 0} {$i < [expr {[llength $ip_instances] - 1}]} {incr i} { # Connect 'ap_done' of ip_i to 'ap_start' of ip_i_plus1 if {[string length $ap_done_pin] > 0 && [string length $ap_start_pin] > 0} { connect_bd_net $ap_done_pin $ap_start_pin + puts "Connected 'ap_done' of $ip_i to 'ap_start' of $ip_i_plus1" } else { puts "Warning: Could not find 'ap_done' or 'ap_start' pin for IPs $ip_i and $ip_i_plus1" } @@ -281,8 +282,6 @@ for {set i 0} {$i < [expr {[llength $ip_instances] - 1}]} {incr i} { # Get AXI Stream interface pins from ip_i and ip_i_plus1 set ip_i_intf_pins [get_bd_intf_pins -of $ip_i_cell] set ip_i_plus1_intf_pins [get_bd_intf_pins -of $ip_i_plus1_cell] - - # Initialize variables set ip_i_axis_master "" set ip_i_plus1_axis_slave "" @@ -307,7 +306,7 @@ for {set i 0} {$i < [expr {[llength $ip_instances] - 1}]} {incr i} { } # Check if both interfaces are found - if {[string length $ip_i_axis_master] && [string length $ip_i_plus1_axis_slave]} { + if {[string length $ip_i_axis_master] > 0 && [string length $ip_i_plus1_axis_slave] > 0} { # Connect the AXI Stream interfaces connect_bd_intf_net $ip_i_axis_master $ip_i_plus1_axis_slave puts "Connected AXI Stream interface between $ip_i and $ip_i_plus1" @@ -319,6 +318,7 @@ for {set i 0} {$i < [expr {[llength $ip_instances] - 1}]} {incr i} { if {$interface_type == "axi_stream"} { # Create external port for 'ap_start' and connect all 'ap_start' pins + # ap_start in streaming IPs needs to be constantly high if {[llength $ap_start_ports] > 0} { create_bd_port -dir I ap_start set ap_start_port [get_bd_ports ap_start] @@ -347,7 +347,7 @@ if {$interface_type == "axi_stream"} { make_bd_intf_pins_external $first_ip_axis_slave # Retrieve the external interface port set external_intf_port [get_bd_intf_ports -filter "NAME =~ \"${pin_name}*\""] - # Change name to base_name and associate clock + # Change name to base_name set_property NAME $pin_name $external_intf_port set input_pin_name $pin_name } else { @@ -374,9 +374,8 @@ if {$interface_type == "axi_stream"} { if {[string length $last_ip_axis_master] > 0} { # Make the interface pin external make_bd_intf_pins_external $last_ip_axis_master - # Retrieve the external interface port + # Retrieve the external interface port and change name to base name set external_intf_port [get_bd_intf_ports -filter "NAME =~ \"${pin_name}*\""] - # Change name to base_name and associate clock set_property NAME $pin_name $external_intf_port set output_pin_name $pin_name } else { @@ -384,8 +383,9 @@ if {$interface_type == "axi_stream"} { return } - # associate input and output bus interfaces to run at ap_clk + # associate input, output and ap_rst to run 
at 'ap_clk' set_property CONFIG.ASSOCIATED_BUSIF [list "${input_pin_name}:${output_pin_name}"] [get_bd_ports /ap_clk] + set_property CONFIG.ASSOCIATED_RESET {ap_rst} [get_bd_ports /ap_clk] # Make external the 'ap_done' signal of the last IP set last_ip_pins [get_bd_pins -of $last_ip_cell] @@ -444,47 +444,32 @@ if {$interface_type == "axi_stream"} { puts "Warning: Could not find 'ap_done' pin for last IP" } - # Make external the inputs of the first IP (including 'vld' signals) - set first_ip_input_ports [get_bd_pins -of $first_ip_cell] - foreach pin $first_ip_input_ports { - set pin_name [get_property NAME $pin] - # Match patterns for inputs and input valid pins - if {[regexp {^\w+_input_(\d+)$} $pin_name] || [regexp {^\w+_input_(\d+)_ap_vld$} $pin_name]} { - # Get pin properties - set pin_dir [get_property DIR $pin] - set pin_left [get_property LEFT $pin] - set pin_right [get_property RIGHT $pin] - set pin_type [get_property TYPE $pin] - if {$pin_left ne "" && $pin_right ne ""} { - # Create an external port with the same name, bit range and type - set ext_port [create_bd_port -dir $pin_dir -from $pin_left -to $pin_right -type $pin_type $pin_name] - } else { - # For single-bit signals where LEFT and RIGHT may not be defined - set ext_port [create_bd_port -dir $pin_dir -type $pin_type $pin_name] - } - connect_bd_net $ext_port $pin - } + + # Make external the input of the first IP (including 'vld' signals) + set first_ip_pins [get_bd_pins -of $first_ip_cell] + foreach pin $first_ip_pins { + set pin_name [get_property NAME $pin] + # Match patterns for inputs and input valid pins + if {[regexp {^\w+_input_(\d+)$} $pin_name] || [regexp {^\w+_input_(\d+)_ap_vld$} $pin_name]} { + # Make the pin external + make_bd_pins_external $pin + # Retrieve the external port and change name to base name + set external_port [get_bd_ports -filter "NAME =~ \"${pin_name}*\""] + set_property NAME $pin_name $external_port + } } - # Make external the outputs of the last IP (including 'vld' signals) - set last_ip_output_ports [get_bd_pins -of $last_ip_cell] - foreach pin $last_ip_output_ports { + # Make external the output of the last IP (including 'vld' signals) + set last_ip_pins [get_bd_pins -of $last_ip_cell] + foreach pin $last_ip_pins { set pin_name [get_property NAME $pin] - # Match patterns for ouputs and output valid pins + # Match patterns for inputs and input valid pins if {[regexp {^layer(?:\d+_)?out_(\d+)$} $pin_name] || [regexp {^layer(?:\d+_)?out_(\d+)_ap_vld$} $pin_name]} { - # Get pin properties - set pin_dir [get_property DIR $pin] - set pin_left [get_property LEFT $pin] - set pin_right [get_property RIGHT $pin] - set pin_type [get_property TYPE $pin] - if {$pin_left ne "" && $pin_right ne ""} { - # Create an external port with the same name, bit range and type - set ext_port [create_bd_port -dir $pin_dir -from $pin_left -to $pin_right -type $pin_type $pin_name] - } else { - # For single-bit signals where LEFT and RIGHT may not be defined - set ext_port [create_bd_port -dir $pin_dir -type $pin_type $pin_name] - } - connect_bd_net $ext_port $pin + # Make the pin external + make_bd_pins_external $pin + # Retrieve the external port and change name to base name + set external_port [get_bd_ports -filter "NAME =~ \"${pin_name}*\""] + set_property NAME $pin_name $external_port } } } From bba704bb3431b7528f8c1711147b570b5f7ff34f Mon Sep 17 00:00:00 2001 From: dimdano Date: Mon, 18 Nov 2024 17:22:40 +0100 Subject: [PATCH 14/50] fix in tcl. 
folder creation for stitch project

---
 hls4ml/backends/vitis/vitis_backend.py | 6 ++++--
 scripts/ip_stitcher.tcl                | 6 +++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py
index d62442e4a2..c38396b8b2 100644
--- a/hls4ml/backends/vitis/vitis_backend.py
+++ b/hls4ml/backends/vitis/vitis_backend.py
@@ -131,13 +131,15 @@ def build(
 
     def stitch_design(self, output_dir, project_name):
         os.makedirs(output_dir, exist_ok=True)
+        vivado_stitched_dir = os.path.join(output_dir, 'vivado_stitched_design')
+        os.makedirs(vivado_stitched_dir, exist_ok=True)
 
         spec = importlib.util.find_spec("hls4ml")
         hls4ml_path = os.path.dirname(spec.origin)
         stitch_command = 'vivado -mode batch -nojournal -nolog -notrace -source ' + hls4ml_path + '/../scripts/ip_stitcher.tcl'
 
-        stdout_log = os.path.join(output_dir, 'stitcher_stdout.log')
-        stderr_log = os.path.join(output_dir, 'stitcher_stderr.log')
+        stdout_log = os.path.join(vivado_stitched_dir, 'stitcher_stdout.log')
+        stderr_log = os.path.join(vivado_stitched_dir, 'stitcher_stderr.log')
 
         with open(stdout_log, 'w') as stdout_file, open(stderr_log, 'w') as stderr_file:
             # Use subprocess.Popen to capture output
diff --git a/scripts/ip_stitcher.tcl b/scripts/ip_stitcher.tcl
index 706869656f..9161c5568b 100644
--- a/scripts/ip_stitcher.tcl
+++ b/scripts/ip_stitcher.tcl
@@ -337,7 +337,7 @@ if {$interface_type == "axi_stream"} {
     set first_ip_axis_slave ""
     foreach intf_pin $first_ip_intf_pins {
         set pin_name [get_property NAME $intf_pin]
-        if {[string match "*s_axis*" $pin_name] || [string match "*input*" $pin_name]} {
+        if {[string match "*s_axis*" $pin_name] || [string match "*inp*" $pin_name]} {
             set first_ip_axis_slave $intf_pin
             break
         }
@@ -450,7 +450,7 @@ if {$interface_type == "axi_stream"} {
     foreach pin $first_ip_pins {
         set pin_name [get_property NAME $pin]
         # Match patterns for inputs and input valid pins
-        if {[regexp {^\w+_input_(\d+)$} $pin_name] || [regexp {^\w+_input_(\d+)_ap_vld$} $pin_name]} {
+        if {[regexp {^\w+_(input|inp|layer)(?:_(\d+))?(?:_ap_vld)?$} $pin_name]} {
             # Make the pin external
             make_bd_pins_external $pin
             # Retrieve the external port and change name to base name
@@ -464,7 +464,7 @@ if {$interface_type == "axi_stream"} {
     foreach pin $last_ip_pins {
         set pin_name [get_property NAME $pin]
         # Match patterns for inputs and input valid pins
-        if {[regexp {^layer(?:\d+_)?out_(\d+)$} $pin_name] || [regexp {^layer(?:\d+_)?out_(\d+)_ap_vld$} $pin_name]} {
+        if {[regexp {^layer(?:\d+_)?out(?:_(\d+))?(?:_ap_vld)?$} $pin_name]} {
             # Make the pin external
             make_bd_pins_external $pin
             # Retrieve the external port and change name to base name

From da3efb05e296af03ff075d881b197a5301203d0a Mon Sep 17 00:00:00 2001
From: dimdano
Date: Fri, 22 Nov 2024 17:58:04 +0100
Subject: [PATCH 15/50] package final stitched ip in hls4ml

Notes:
* missing X_INTERFACE_INFO for axi interfaces in the generated HDL
  during packaging
* Vivado throws warning : Misformed interface info
* We omit this warning at the moment, as the IP can still be packaged
---
 hls4ml/backends/vitis/vitis_backend.py |  6 ++---
 hls4ml/model/graph.py                  |  4 ++--
 scripts/ip_stitcher.tcl                | 33 ++++++++++++++++++++++----
 3 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py
index c38396b8b2..34c4d4a7d0 100644
--- a/hls4ml/backends/vitis/vitis_backend.py
+++ b/hls4ml/backends/vitis/vitis_backend.py
@@ -129,15 +129,15 @@ def build(
 
         return
parse_vivado_report(output_dir) - def stitch_design(self, output_dir, project_name): + def stitch_design(self, output_dir, project_name, export = False): os.makedirs(output_dir, exist_ok=True) vivado_stitched_dir = os.path.join(output_dir, 'vivado_stitched_design') os.makedirs(vivado_stitched_dir, exist_ok=True) spec = importlib.util.find_spec("hls4ml") hls4ml_path = os.path.dirname(spec.origin) - stitch_command = 'vivado -mode batch -nojournal -nolog -notrace -source ' + hls4ml_path + '/../scripts/ip_stitcher.tcl' - + stitch_flags = ' -tclargs export_design' if export else '' + stitch_command = 'vivado -mode batch -nojournal -nolog -notrace -source ' + hls4ml_path + '/../scripts/ip_stitcher.tcl' + stitch_flags stdout_log = os.path.join(vivado_stitched_dir, 'stitcher_stdout.log') stderr_log = os.path.join(vivado_stitched_dir, 'stitcher_stderr.log') diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 962b75336b..fe7384d946 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -1089,8 +1089,8 @@ def trace(self, x): trace_output.append(curr_trace_output) return output_data, trace_output - def stitch_design(self, **kwargs): - self.backend.stitch_design(self.output_dir, self.project_name, **kwargs) + def stitch_design(self, export = False, **kwargs): + self.backend.stitch_design(self.output_dir, self.project_name, export = export, **kwargs) def _print_status(self, status): # Clear the terminal line and print build status diff --git a/scripts/ip_stitcher.tcl b/scripts/ip_stitcher.tcl index 9161c5568b..194ca083ea 100644 --- a/scripts/ip_stitcher.tcl +++ b/scripts/ip_stitcher.tcl @@ -123,12 +123,16 @@ if {[llength $ap_rst_ports] > 0} { set sample_rst_pin [lindex $ap_rst_ports 0] set rst_polarity [get_property CONFIG.POLARITY $sample_rst_pin] # Create the 'ap_rst' port - create_bd_port -dir I -type rst ap_rst + set rst_port_name "ap_rst" + create_bd_port -dir I -type rst $rst_port_name set ap_rst_port [get_bd_ports ap_rst] # Set the CONFIG.POLARITY property of the 'ap_rst' port based on the retrieved polarity if {$rst_polarity ne ""} { set_property CONFIG.POLARITY $rst_polarity $ap_rst_port + # naming convention for active-low signals + set rst_port_name "ap_rst_n" + set_property NAME $rst_port_name $ap_rst_port } else { # Fallback to ACTIVE_HIGH if the retrieved polarity is not defined set_property CONFIG.POLARITY ACTIVE_HIGH $ap_rst_port @@ -385,7 +389,7 @@ if {$interface_type == "axi_stream"} { # associate input, output and ap_rst to run at 'ap_clk' set_property CONFIG.ASSOCIATED_BUSIF [list "${input_pin_name}:${output_pin_name}"] [get_bd_ports /ap_clk] - set_property CONFIG.ASSOCIATED_RESET {ap_rst} [get_bd_ports /ap_clk] + set_property CONFIG.ASSOCIATED_RESET $rst_port_name [get_bd_ports /ap_clk] # Make external the 'ap_done' signal of the last IP set last_ip_pins [get_bd_pins -of $last_ip_cell] @@ -474,15 +478,34 @@ if {$interface_type == "axi_stream"} { } } -save_bd_design +validate_bd_design regenerate_bd_layout -close_project +save_bd_design puts "###########################################################" puts "# Successfully connected the ports of each IP instance " -puts "# from '[lindex $ip_instances 0]' to '[lindex $ip_instances [expr {$repo_count - 1}]]'." puts "# A total of $repo_count IPs were connected. " puts "###########################################################" +if { [lsearch -exact $argv "export_design"] >= 0 } { + puts "Exporting the final stitched IP..." 
+ set stitched_ip_dir "ip_repo" + ipx::package_project -root_dir $stitched_ip_dir \ + -vendor user.org -library user -taxonomy /UserIP -module $bd_name \ + -import_files + set_property description "This IP core integrates all NN subgraph IPs into one." [ipx::find_open_core user.org:user:stitched_design:1.0] + set_property core_revision 2 [ipx::find_open_core user.org:user:stitched_design:1.0] + ipx::create_xgui_files [ipx::find_open_core user.org:user:stitched_design:1.0] + ipx::update_checksums [ipx::find_open_core user.org:user:stitched_design:1.0] + ipx::check_integrity [ipx::find_open_core user.org:user:stitched_design:1.0] + ipx::save_core [ipx::find_open_core user.org:user:stitched_design:1.0] + puts "Stitched IP has been exported to '$stitched_ip_dir' folder" +} + +close_project + + + + From 0f40e2aeac73b7e951728ccd46c1e83862c10ae4 Mon Sep 17 00:00:00 2001 From: dimdano Date: Mon, 2 Dec 2024 17:21:10 +0100 Subject: [PATCH 16/50] support for multiple inputs/outputs in first/last layer of stitched ip --- scripts/ip_stitcher.tcl | 86 +++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 37 deletions(-) diff --git a/scripts/ip_stitcher.tcl b/scripts/ip_stitcher.tcl index 194ca083ea..e4fcfbf6c2 100644 --- a/scripts/ip_stitcher.tcl +++ b/scripts/ip_stitcher.tcl @@ -331,64 +331,59 @@ if {$interface_type == "axi_stream"} { } } - # Make external the input interface of the first IP + # Make external all input interfaces of the first IP set first_ip_cell [get_bd_cells [lindex $ip_instances 0]] if {[string length $first_ip_cell] == 0} { puts "Error: Could not find the first IP cell." return } set first_ip_intf_pins [get_bd_intf_pins -of $first_ip_cell] - set first_ip_axis_slave "" + set input_pin_names {} foreach intf_pin $first_ip_intf_pins { set pin_name [get_property NAME $intf_pin] if {[string match "*s_axis*" $pin_name] || [string match "*inp*" $pin_name]} { - set first_ip_axis_slave $intf_pin - break + # Make the interface pin external + make_bd_intf_pins_external $intf_pin + # Retrieve the external interface port + set external_intf_port [get_bd_intf_ports -filter "NAME =~ \"${pin_name}*\""] + # Change name to base_name + set_property NAME $pin_name $external_intf_port + lappend input_pin_names $pin_name } } - if {[string length $first_ip_axis_slave] > 0} { - # Make the interface pin external - make_bd_intf_pins_external $first_ip_axis_slave - # Retrieve the external interface port - set external_intf_port [get_bd_intf_ports -filter "NAME =~ \"${pin_name}*\""] - # Change name to base_name - set_property NAME $pin_name $external_intf_port - set input_pin_name $pin_name - } else { - puts "Error: Could not find input AXI Stream interface for first IP." + if {[llength $input_pin_names] == 0} { + puts "Error: Could not find any input AXI Stream interfaces for first IP." return } - - # Make external the output interface of the last IP + # Make external all output interfaces of the last IP set last_ip_cell [get_bd_cells [lindex $ip_instances end]] if {[string length $last_ip_cell] == 0} { puts "Error: Could not find the last IP cell." 
return
     }
     set last_ip_intf_pins [get_bd_intf_pins -of $last_ip_cell]
-    set last_ip_axis_master ""
+    set output_pin_names {}
     foreach intf_pin $last_ip_intf_pins {
         set pin_name [get_property NAME $intf_pin]
-        if {[string match "*m_axis*" $pin_name] || [string match "*out*" $pin_name]} {
-            set last_ip_axis_master $intf_pin
-            break
+        if {[string match "*m_axis*" $pin_name] || [string match "*out*" $pin_name]} {
+            # Make the interface pin external
+            make_bd_intf_pins_external $intf_pin
+            # Retrieve the external interface port and change name to base name
+            set external_intf_port [get_bd_intf_ports -filter "NAME =~ \"${pin_name}*\""]
+            set_property NAME $pin_name $external_intf_port
+            lappend output_pin_names $pin_name
         }
     }
-    if {[string length $last_ip_axis_master] > 0} {
-        # Make the interface pin external
-        make_bd_intf_pins_external $last_ip_axis_master
-        # Retrieve the external interface port and change name to base name
-        set external_intf_port [get_bd_intf_ports -filter "NAME =~ \"${pin_name}*\""]
-        set_property NAME $pin_name $external_intf_port
-        set output_pin_name $pin_name
-    } else {
-        puts "Error: Could not find output AXI Stream interface for last IP."
+    if {[llength $output_pin_names] == 0} {
+        puts "Error: Could not find any output AXI Stream interfaces for last IP."
         return
     }
 
-    # associate input, output and ap_rst to run at 'ap_clk'
-    set_property CONFIG.ASSOCIATED_BUSIF [list "${input_pin_name}:${output_pin_name}"] [get_bd_ports /ap_clk]
+    # Associate input, output, and ap_rst to run at 'ap_clk'
+    # Join interface names with colons to match the required format
+    set associated_busif [join [concat $input_pin_names $output_pin_names] ":"]
+    set_property CONFIG.ASSOCIATED_BUSIF $associated_busif [get_bd_ports /ap_clk]
     set_property CONFIG.ASSOCIATED_RESET $rst_port_name [get_bd_ports /ap_clk]
 
     # Make external the 'ap_done' signal of the last IP
@@ -412,6 +407,10 @@ if {$interface_type == "axi_stream"} {
 } elseif {$interface_type == "unpacked"} {
     # Make 'ap_start' of the first IP external
     set first_ip_cell [get_bd_cells [lindex $ip_instances 0]]
+    if {[string length $first_ip_cell] == 0} {
+        puts "Error: Could not find the first IP cell."
+        return
+    }
     set first_ip_pins [get_bd_pins -of $first_ip_cell]
     set first_ap_start_pin ""
     foreach pin $first_ip_pins {
@@ -431,6 +430,10 @@ if {$interface_type == "axi_stream"} {
 
     # Make 'ap_done' of the last IP external
     set last_ip_cell [get_bd_cells [lindex $ip_instances end]]
+    if {[string length $last_ip_cell] == 0} {
+        puts "Error: Could not find the last IP cell."
+        return
+    }
     set last_ip_pins [get_bd_pins -of $last_ip_cell]
     set last_ap_done_pin ""
     foreach pin $last_ip_pins {
@@ -448,9 +451,8 @@ if {$interface_type == "axi_stream"} {
         puts "Warning: Could not find 'ap_done' pin for last IP"
     }
 
-
-    # Make external the input of the first IP (including 'vld' signals)
-    set first_ip_pins [get_bd_pins -of $first_ip_cell]
+    # Make external all inputs of the first IP (including 'vld' signals)
+    set input_pin_names {}
     foreach pin $first_ip_pins {
         set pin_name [get_property NAME $pin]
         # Match patterns for inputs and input valid pins
@@ -460,22 +462,32 @@ if {$interface_type == "axi_stream"} {
             # Make the pin external
             make_bd_pins_external $pin
             # Retrieve the external port and change name to base name
             set external_port [get_bd_ports -filter "NAME =~ \"${pin_name}*\""]
             set_property NAME $pin_name $external_port
+            lappend input_pin_names $pin_name
         }
     }
+    if {[llength $input_pin_names] == 0} {
+        puts "Error: Could not find any input pins for first IP."
+ return + } - # Make external the output of the last IP (including 'vld' signals) - set last_ip_pins [get_bd_pins -of $last_ip_cell] + # Make external all outputs of the last IP (including 'vld' signals) + set output_pin_names {} foreach pin $last_ip_pins { set pin_name [get_property NAME $pin] - # Match patterns for inputs and input valid pins + # Match patterns for outputs and output valid pins if {[regexp {^layer(?:\d+_)?out(?:_(\d+))?(?:_ap_vld)?$} $pin_name]} { # Make the pin external make_bd_pins_external $pin # Retrieve the external port and change name to base name set external_port [get_bd_ports -filter "NAME =~ \"${pin_name}*\""] set_property NAME $pin_name $external_port + lappend output_pin_names $pin_name } } + if {[llength $output_pin_names] == 0} { + puts "Error: Could not find any output pins for last IP." + return + } } validate_bd_design From d24c42bdb97735b6ea2a11d5965ed294f85f0773 Mon Sep 17 00:00:00 2001 From: dimdano Date: Tue, 3 Dec 2024 16:04:24 +0100 Subject: [PATCH 17/50] initial support for stitched ip simulation --- hls4ml/backends/vitis/vitis_backend.py | 23 ++- hls4ml/converters/keras_to_hls.py | 10 +- hls4ml/model/graph.py | 14 +- hls4ml/report/vivado_report.py | 198 +++++++++++++++++++++++++ scripts/ip_stitcher.tcl | 49 +++++- 5 files changed, 277 insertions(+), 17 deletions(-) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index 34c4d4a7d0..462ada2307 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -6,6 +6,7 @@ from hls4ml.backends import VivadoBackend from hls4ml.model.flow import get_flow, register_flow from hls4ml.report import parse_vivado_report +from hls4ml.report import parse_xml_and_write_testbench class VitisBackend(VivadoBackend): @@ -129,15 +130,27 @@ def build( return parse_vivado_report(output_dir) - def stitch_design(self, output_dir, project_name, export = False): + def stitch_design(self, output_dir, project_name, sim_design = False, export = False): os.makedirs(output_dir, exist_ok=True) vivado_stitched_dir = os.path.join(output_dir, 'vivado_stitched_design') os.makedirs(vivado_stitched_dir, exist_ok=True) spec = importlib.util.find_spec("hls4ml") hls4ml_path = os.path.dirname(spec.origin) - stitch_flags = ' -tclargs export_design' if export else '' - stitch_command = 'vivado -mode batch -nojournal -nolog -notrace -source ' + hls4ml_path + '/../scripts/ip_stitcher.tcl' + stitch_flags + + # TODO fix verilog generator and output path + sim_verilog_file = parse_xml_and_write_testbench(vivado_stitched_dir) + + # Build the command as a list + stitch_command = [ + 'vivado', '-mode', 'batch', '-nojournal', '-nolog', '-notrace', + '-source', os.path.join(hls4ml_path, '../scripts/ip_stitcher.tcl'), + '-tclargs', # Add this line + f'sim_design={int(sim_design)}', + f'export_design={int(export)}', + f'sim_verilog_file={sim_verilog_file}' + ] + stdout_log = os.path.join(vivado_stitched_dir, 'stitcher_stdout.log') stderr_log = os.path.join(vivado_stitched_dir, 'stitcher_stderr.log') @@ -145,11 +158,11 @@ def stitch_design(self, output_dir, project_name, export = False): # Use subprocess.Popen to capture output process = subprocess.Popen( stitch_command, - shell=True, cwd=output_dir, stdout=stdout_file, stderr=stderr_file, - text=True + text=True, + shell=False ) process.communicate() if process.returncode != 0: diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index 237bfd33b2..a65e0be81c 100644 --- 
a/hls4ml/converters/keras_to_hls.py
+++ b/hls4ml/converters/keras_to_hls.py
@@ -326,17 +326,15 @@ def keras_to_hls(config, split_layer_names = []):
     model_arch, reader = get_model_arch(config)
     layer_list, input_layers, output_layers, output_shapes = parse_keras_model(model_arch, reader)
+    merge_layers = ['add', 'subtract', 'multiply', 'average', 'maximum', 'minimum', 'concatenate', 'dot']
 
     print('Creating HLS model...')
     if split_layer_names:
-        if all(name.startswith('fc') or name.startswith('dense') or
-               name.startswith('conv') or
-               name.startswith('activation') or name.startswith('relu')
-               for name in split_layer_names):
+        if any(any(layer in name for layer in merge_layers) for name in split_layer_names):
+            raise ValueError("Split layer must not be a merge layer")
+        else:
             hls_models = ModelGraph.make_multi_graph(config, layer_list, output_shapes, split_layer_names)
             print('Multi-graph HLS model created.')
             return hls_models
-        else:
-            raise ValueError(f"Split layer must be either dense or fc layers")
     else:
         hls_model = ModelGraph(config, layer_list, input_layers, output_layers)
         print('HLS model created.')
diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py
index fe7384d946..db0a91af8d 100644
--- a/hls4ml/model/graph.py
+++ b/hls4ml/model/graph.py
@@ -322,7 +322,7 @@ class ModelGraph:
         outputs (list, optional):  The outputs to the model. If None, determined from layer_list
     """
 
-    def __init__(self, config, layer_list, inputs=None, outputs=None, initial_index=0):
+    def __init__(self, config, layer_list, inputs=None, outputs=None, initial_index=0): #, output_vars={}):
         self.config = HLSConfig(config)
 
         # keep track of the applied flows
@@ -343,7 +343,7 @@ def __init__(self, config, layer_list, inputs=None, outputs=None, initial_index=
         self.index = initial_index
 
         self.graph = OrderedDict()  # where the nodes are stored
-        self.output_vars = {}
+        #self.output_vars = output_vars
 
         self._top_function_lib = None
 
@@ -943,6 +943,7 @@ def make_multi_graph(cls, config, layer_list, output_shapes, split_layer_names):
         original_OutputDir = config['OutputDir']
         original_ProjectName = config['ProjectName']
         current_index = 0
+        #curr_output_vars = {}
         last_output_precision = None
 
         for idx, sub_layer_list in enumerate(subgraphs_layer_lists):
@@ -1011,7 +1012,7 @@ def make_multi_graph(cls, config, layer_list, output_shapes, split_layer_names):
             if layer_indices:
                 max_index = max(layer_indices)
                 current_index = max_index - 1  # we have the input layer as well
-
+            #curr_output_vars = hls_model.output_vars
             model_graphs.append(hls_model)
 
         return MultiModelGraph(model_graphs)
@@ -1023,6 +1024,9 @@ def __init__(self, graphs):
         self.project_name = re.sub(r'_graph\d+$', '_stitched', graphs[0].config.get_project_name())
         self.output_dir = graphs[0].config.get_output_dir().split('/')[0]
         self.backend = self.graphs[0].config.backend
+
+    def __getitem__(self, index):
+        return self.graphs[index]
 
     def build(self, max_workers=None, **kwargs):
        # Build all ModelGraph instances in parallel.
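Note: with the plural split_layer_names introduced here, the intended conversion flow can be sketched roughly as below. This assumes convert_from_keras_model forwards the new keyword down to keras_to_hls; the model and layer names are hypothetical:

    import hls4ml

    hls_models = hls4ml.converters.convert_from_keras_model(
        model,                             # a trained Keras model
        hls_config=config,
        backend='Vitis',
        split_layer_names=['fc2', 'fc4'],  # must not name merge layers, per the check above
    )
    hls_models.compile()
    y = hls_models.predict(X)              # predict() chains through every subgraph
    first_graph = hls_models[0]            # uses the __getitem__ added above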
@@ -1089,8 +1093,8 @@ def trace(self, x): trace_output.append(curr_trace_output) return output_data, trace_output - def stitch_design(self, export = False, **kwargs): - self.backend.stitch_design(self.output_dir, self.project_name, export = export, **kwargs) + def stitch_design(self, sim_design = False, export = False, **kwargs): + self.backend.stitch_design(self.output_dir, self.project_name, sim_design = sim_design, export = export, **kwargs) def _print_status(self, status): # Clear the terminal line and print build status diff --git a/hls4ml/report/vivado_report.py b/hls4ml/report/vivado_report.py index d63d729fdc..8eec339cc5 100644 --- a/hls4ml/report/vivado_report.py +++ b/hls4ml/report/vivado_report.py @@ -2,6 +2,7 @@ import re import sys import xml.etree.ElementTree as ET +from lxml import etree def read_vivado_report(hls_dir, full_report=False): @@ -672,3 +673,200 @@ def _make_report_body(report_dict, make_table_template, make_header_template): body = body.format(**params) return body + +def parse_xml_and_write_testbench(vivado_base_folder = None): + + component_xml_path = os.path.join(vivado_base_folder, 'ip_repo/component.xml') + if not os.path.exists(component_xml_path): + raise FileNotFoundError(f"component.xml not found at {component_xml_path}") + + # Parse the XML file + tree = etree.parse('model_2graphs/vivado_stitched_design/ip_repo/component.xml') + root = tree.getroot() + + # Define the namespaces + ns = { + 'spirit': 'http://www.spiritconsortium.org/XMLSchema/SPIRIT/1685-2009', + 'xilinx': 'http://www.xilinx.com', + 'xsi': 'http://www.w3.org/2001/XMLSchema-instance' + } + + # Extract ports + ports = root.findall('.//spirit:model/spirit:ports/spirit:port', namespaces=ns) + + inputs = [] + outputs = [] + + for port in ports: + name = port.find('spirit:name', namespaces=ns).text + wire = port.find('spirit:wire', namespaces=ns) + if wire is not None: + direction = wire.find('spirit:direction', namespaces=ns).text + vector = wire.find('spirit:vector', namespaces=ns) + if vector is not None: + left = vector.find('spirit:left', namespaces=ns).text + right = vector.find('spirit:right', namespaces=ns).text + width = abs(int(left) - int(right)) + 1 + else: + width = 1 + port_info = {'name': name, 'direction': direction, 'width': width} + if direction == 'in': + inputs.append(port_info) + elif direction == 'out': + outputs.append(port_info) + + # Generate testbench code + testbench_code = '`timescale 1ns / 1ps\n\n' + testbench_code += 'module tb_design_1_wrapper;\n\n' + + # Generate signal declarations + # Clock and reset signals + clock_signal = None + reset_signal = None + signal_declarations = '' + + for port in inputs + outputs: + width_str = f'[{port["width"]-1}:0] ' if port['width'] > 1 else '' + if port['direction'] == 'in': + if 'clk' in port['name'].lower(): + clock_signal = port['name'] + if 'rst' in port['name'].lower(): + reset_signal = port['name'] + signal_declarations += f' reg {width_str}{port["name"]};\n' + else: + signal_declarations += f' wire {width_str}{port["name"]};\n' + + testbench_code += '// Signal Declarations\n' + testbench_code += signal_declarations + '\n' + + # Instantiate the DUT + dut_instantiation = ' // Instantiate the Design Under Test (DUT)\n' + dut_instantiation += ' stitched_design dut (\n' + + port_connections = [] + for port in inputs + outputs: + port_connections.append(f" .{port['name']}({port['name']})") + dut_instantiation += ',\n'.join(port_connections) + dut_instantiation += '\n );\n\n' + + testbench_code += dut_instantiation + + # 
Clock generation + clock_logic = '' + if clock_signal: + clock_logic += f' // Clock Generation (100 MHz)\n' + clock_logic += f' initial begin\n' + clock_logic += f' {clock_signal} = 0;\n' + clock_logic += f' forever #5 {clock_signal} = ~{clock_signal}; // Clock period of 10 ns\n' + clock_logic += f' end\n\n' + + testbench_code += clock_logic + + # Reset generation + reset_logic = '' + if reset_signal: + reset_logic += f' // Reset Generation\n' + reset_logic += f' initial begin\n' + reset_logic += f' {reset_signal} = 0;\n' + reset_logic += f' repeat (5) @(posedge {clock_signal});\n' + reset_logic += f' {reset_signal} = 1;\n' + reset_logic += f' end\n\n' + + testbench_code += reset_logic + + # Control signals initialization + control_signals_init = ' // Control Signals Initialization\n' + control_signals_init += ' initial begin\n' + + # Initialize control signals + for port in inputs: + if port['name'] != clock_signal and port['name'] != reset_signal: + control_signals_init += f' {port["name"]} = 0;\n' + + # Set tready signals to 1 + for port in inputs: + if 'tready' in port['name'].lower(): + control_signals_init += f' {port["name"]} = 1;\n' + for port in outputs: + if 'tready' in port['name'].lower(): + control_signals_init += f' {port["name"]} = 1;\n' + + control_signals_init += ' end\n\n' + testbench_code += control_signals_init + + # Input stimulus + input_stimulus = ' // Input Stimulus\n' + input_stimulus += ' integer i;\n' + input_stimulus += ' initial begin\n' + if reset_signal: + input_stimulus += f' wait ({reset_signal} == 1);\n' + input_stimulus += f' repeat (2) @(posedge {clock_signal});\n\n' + + ap_start_signal = next((sig['name'] for sig in inputs if 'ap_start' in sig['name']), None) + if ap_start_signal: + input_stimulus += f' // Start the operation\n' + input_stimulus += f' {ap_start_signal} = 1;\n\n' + + # Send input data + tdata_signal = next((sig['name'] for sig in inputs if 'tdata' in sig['name'].lower()), None) + tvalid_signal = next((sig['name'] for sig in inputs if 'tvalid' in sig['name'].lower()), None) + tready_signal = next((sig['name'] for sig in outputs if 'tready' in sig['name'].lower()), None) + + if tdata_signal and tvalid_signal and tready_signal: + tdata_width = next((sig['width'] for sig in inputs if sig['name'] == tdata_signal), 32) + input_stimulus += f' // Send input data\n' + input_stimulus += f' {tvalid_signal} = 1;\n' + input_stimulus += f' for (i = 0; i < 16; i = i + 1) begin\n' + input_stimulus += f' {tdata_signal} = $random;\n' + input_stimulus += f' while ({tready_signal} == 0) @(posedge {clock_signal});\n' + input_stimulus += f' @(posedge {clock_signal});\n' + input_stimulus += f' end\n' + input_stimulus += f' {tvalid_signal} = 0;\n' + input_stimulus += ' end\n\n' + testbench_code += input_stimulus + + # Output capture and latency measurement + output_capture = ' // Output Capture\n' + output_capture += ' integer outfile;\n' + output_capture += ' initial begin\n' + output_capture += ' outfile = $fopen("output_data.txt", "w");\n' + if reset_signal: + output_capture += f' wait ({reset_signal} == 1);\n' + output_capture += f' repeat (2) @(posedge {clock_signal});\n\n' + output_capture += ' // Monitor outputs\n' + tdata_out_signal = next((sig['name'] for sig in outputs if 'tdata' in sig['name'].lower()), None) + tvalid_out_signal = next((sig['name'] for sig in outputs if 'tvalid' in sig['name'].lower()), None) + tready_out_signal = next((sig['name'] for sig in inputs if 'tready' in sig['name'].lower()), None) + + if tdata_out_signal and 
tvalid_out_signal and tready_out_signal: + output_capture += ' forever begin\n' + output_capture += f' @(posedge {clock_signal});\n' + output_capture += f' if ({tvalid_out_signal} && {tready_out_signal}) begin\n' + output_capture += f' $fwrite(outfile, "%0d ns: Data = %h\\n", $time, {tdata_out_signal});\n' + output_capture += f' end\n' + output_capture += ' end\n' + else: + output_capture += ' // No output AXI4-Stream signals detected\n' + output_capture += ' end\n\n' + + testbench_code += output_capture + + # Finish simulation when ap_done is asserted + ap_done_signal = next((sig['name'] for sig in outputs if 'ap_done' in sig['name']), None) + if ap_done_signal: + testbench_code += f' // Finish simulation when operation is done\n' + testbench_code += f' initial begin\n' + testbench_code += f' wait ({ap_done_signal} == 1);\n' + testbench_code += f' $fclose(outfile);\n' + testbench_code += f' $finish;\n' + testbench_code += f' end\n\n' + + testbench_code += 'endmodule\n' + + # Write the testbench to a file + testbench_file_path = os.path.join(vivado_base_folder, 'testbench.v') + with open(testbench_file_path, 'w') as f: + f.write(testbench_code) + print("Testbench generated successfully.") + + return testbench_file_path diff --git a/scripts/ip_stitcher.tcl b/scripts/ip_stitcher.tcl index e4fcfbf6c2..306d0a1c52 100644 --- a/scripts/ip_stitcher.tcl +++ b/scripts/ip_stitcher.tcl @@ -9,6 +9,28 @@ puts "###########################################################" +array set opt { + sim_design 0 + export_design 0 + sim_verilog_file "" +} + +foreach arg $::argv { + if {[regexp {([^=]+)=(.*)} $arg -> key value]} { + if {[info exists opt($key)]} { + set opt($key) $value + } else { + puts "Warning: Unknown option $key" + } + } else { + puts "Warning: Ignoring argument $arg" + } +} + +set sim_design [expr {$opt(sim_design)}] +set export_design [expr {$opt(export_design)}] +set sim_verilog_file $opt(sim_verilog_file) + # Project base dir set base_dir [pwd] @@ -501,7 +523,7 @@ puts "# Successfully connected the ports of each IP instance " puts "# A total of $repo_count IPs were connected. " puts "###########################################################" -if { [lsearch -exact $argv "export_design"] >= 0 } { +if {$export_design} { puts "Exporting the final stitched IP..." set stitched_ip_dir "ip_repo" ipx::package_project -root_dir $stitched_ip_dir \ @@ -516,6 +538,31 @@ if { [lsearch -exact $argv "export_design"] >= 0 } { puts "Stitched IP has been exported to '$stitched_ip_dir' folder" } +if {$sim_design} { + puts "Adding simulation Verilog file..." + if {$sim_verilog_file != ""} { + if { [file exists $sim_verilog_file] } { + if { [llength [get_filesets sim_1]] == 0 } { + create_fileset -simset sim_1 + } + set_property SOURCE_SET sources_1 [get_filesets sim_1] + add_files -fileset sim_1 -norecurse -scan_for_includes $sim_verilog_file + update_compile_order -fileset sim_1 + puts "Simulation Verilog file added: $sim_verilog_file" + # Set the simulation top module if necessary + set_property top tb_design_1_wrapper [get_filesets sim_1] + # Run the behavioral simulation + set_property -name {xsim.simulate.runtime} -value {200000ns} -objects [get_filesets sim_1] + launch_simulation + } else { + puts "Error: Simulation Verilog file not found: $sim_verilog_file" + } + } else { + puts "Error: sim_verilog_file not provided." 
+ exit 1 + } +} + close_project From 6e8f4628471a6e062904cce791f215e57712c1b0 Mon Sep 17 00:00:00 2001 From: dimdano Date: Fri, 6 Dec 2024 17:45:48 +0100 Subject: [PATCH 18/50] generate verilog testbench for stitched ip --- hls4ml/backends/vitis/vitis_backend.py | 33 ++-- hls4ml/model/graph.py | 53 +++++- hls4ml/report/vivado_report.py | 200 +------------------- hls4ml/utils/simulation_utils.py | 251 +++++++++++++++++++++++++ scripts/ip_stitcher.tcl | 8 +- 5 files changed, 322 insertions(+), 223 deletions(-) create mode 100644 hls4ml/utils/simulation_utils.py diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index 462ada2307..df6e7d2fbc 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -2,11 +2,12 @@ import sys import subprocess import importlib.util +import json from hls4ml.backends import VivadoBackend from hls4ml.model.flow import get_flow, register_flow from hls4ml.report import parse_vivado_report -from hls4ml.report import parse_xml_and_write_testbench +from hls4ml.utils.simulation_utils import generate_verilog_testbench class VitisBackend(VivadoBackend): @@ -130,29 +131,37 @@ def build( return parse_vivado_report(output_dir) - def stitch_design(self, output_dir, project_name, sim_design = False, export = False): + def stitch_design(self, output_dir, project_name, sim_design = False, export = False, nn_config=None): + os.makedirs(output_dir, exist_ok=True) - vivado_stitched_dir = os.path.join(output_dir, 'vivado_stitched_design') - os.makedirs(vivado_stitched_dir, exist_ok=True) + stitched_design_dir = os.path.join(output_dir, 'vivado_stitched_design') + + os.makedirs(stitched_design_dir, exist_ok=True) spec = importlib.util.find_spec("hls4ml") hls4ml_path = os.path.dirname(spec.origin) + + nn_config_file = os.path.join(stitched_design_dir, "nn_config.json") + if nn_config: + with open(nn_config_file, "w") as file: + json.dump(nn_config, file, indent=4) - # TODO fix verilog generator and output path - sim_verilog_file = parse_xml_and_write_testbench(vivado_stitched_dir) - + if(sim_design): + testbench_file_path = os.path.join(stitched_design_dir, "testbench.v") + generate_verilog_testbench(nn_config, testbench_file_path) + # Build the command as a list stitch_command = [ 'vivado', '-mode', 'batch', '-nojournal', '-nolog', '-notrace', '-source', os.path.join(hls4ml_path, '../scripts/ip_stitcher.tcl'), - '-tclargs', # Add this line + '-tclargs', f'sim_design={int(sim_design)}', f'export_design={int(export)}', - f'sim_verilog_file={sim_verilog_file}' + f'sim_verilog_file=vivado_stitched_design/testbench.v' ] - stdout_log = os.path.join(vivado_stitched_dir, 'stitcher_stdout.log') - stderr_log = os.path.join(vivado_stitched_dir, 'stitcher_stderr.log') + stdout_log = os.path.join(stitched_design_dir, 'stitcher_stdout.log') + stderr_log = os.path.join(stitched_design_dir, 'stitcher_stderr.log') with open(stdout_log, 'w') as stdout_file, open(stderr_log, 'w') as stderr_file: # Use subprocess.Popen to capture output @@ -166,4 +175,4 @@ def stitch_design(self, output_dir, project_name, sim_design = False, export = F ) process.communicate() if process.returncode != 0: - raise Exception(f'Stitching failed for {project_name}. See logs for details.') + raise Exception(f'Stitching failed for {project_name}. 
See logs for details.')
\ No newline at end of file
diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py
index db0a91af8d..4b0383e616 100644
--- a/hls4ml/model/graph.py
+++ b/hls4ml/model/graph.py
@@ -322,7 +322,7 @@ class ModelGraph:
         outputs (list, optional):  The outputs to the model. If None, determined from layer_list
     """
 
-    def __init__(self, config, layer_list, inputs=None, outputs=None, initial_index=0): #, output_vars={}):
+    def __init__(self, config, layer_list, inputs=None, outputs=None, initial_index=0):
         self.config = HLSConfig(config)
 
         # keep track of the applied flows
@@ -343,7 +343,7 @@ def __init__(self, config, layer_list, inputs=None, outputs=None, initial_index=
         self.index = initial_index
 
         self.graph = OrderedDict()  # where the nodes are stored
-        #self.output_vars = output_vars
+        self.output_vars = {}
 
         self._top_function_lib = None
 
@@ -1027,6 +1027,41 @@ def __init__(self, graphs):
 
     def __getitem__(self, index):
         return self.graphs[index]
+
+    def parse_nn_config(self):
+        nn_config = {"inputs": [], "outputs": []}
+
+        # Parse layers (inputs and outputs)
+        for graph, io_type in [(self.graphs[0], "inputs"), (self.graphs[-1], "outputs")]:
+            for layer in getattr(graph, io_type):
+                if layer in graph.output_vars:
+                    total_bits = 1
+                    [total_bits := total_bits * num for num in graph.output_vars[layer].shape]
+                    pragma = graph.output_vars[layer].pragma
+                    if isinstance(pragma, str):
+                        layer_pragma = pragma  # 'reshape' or 'partition' pragma
+                        fifo_depth = 1
+                    elif isinstance(pragma, (list, tuple)) and len(pragma) == 2:
+                        layer_pragma = pragma[0]  # 'stream' pragma
+                        fifo_depth = pragma[1]
+                    else:
+                        raise ValueError(f"Unexpected format for pragma: {pragma}")
+                    if total_bits % fifo_depth != 0:
+                        raise ValueError("total_bits is not evenly divisible by fifo_depth.")
+                    batch_size = total_bits // fifo_depth
+                    precision = graph.output_vars[layer].type.precision
+                    nn_config[io_type].append({
+                        "name": graph.output_vars[layer].name,
+                        "pragma": layer_pragma,
+                        "integer_bits": int(precision.integer),
+                        "fractional_bits": int(precision.fractional),
+                        "signed": int(precision.signed),
+                        "fifo_depth": int(fifo_depth),
+                        "batch_size": int(batch_size)
+                    })
+
+        return nn_config
+
     def build(self, max_workers=None, **kwargs):
        # Build all ModelGraph instances in parallel.
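Note: to make the returned structure concrete, parse_nn_config() for a hypothetical two-graph model with one streamed input and one partitioned output produces a dict along these lines (every name and width here is illustrative, not taken from a real build):

    {
        "inputs": [
            {
                "name": "fc1_input",
                "pragma": "stream",
                "integer_bits": 6,
                "fractional_bits": 10,
                "signed": 1,
                "fifo_depth": 16,   # second element of the 'stream' pragma
                "batch_size": 4,    # prod(shape) // fifo_depth
            }
        ],
        "outputs": [
            {
                "name": "layer13_out",
                "pragma": "partition",  # string pragmas get fifo_depth == 1
                "integer_bits": 6,
                "fractional_bits": 10,
                "signed": 1,
                "fifo_depth": 1,
                "batch_size": 10,
            }
        ],
    }

This is the structure that stitch_design() serializes to nn_config.json and that generate_verilog_testbench() consumes.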
@@ -1074,30 +1109,32 @@ def compile(self): g.compile() def predict(self, x): - # Pass the data through each ModelGraph in sequence input_data = x for g in self.graphs: - # Predict with the current ModelGraph output_data = g.predict(input_data) input_data = output_data return output_data def trace(self, x): - # Pass the data through each ModelGraph in sequence input_data = x trace_output = [] for g in self.graphs: - # Trace with the current ModelGraph output_data, curr_trace_output = g.trace(input_data) input_data = output_data trace_output.append(curr_trace_output) return output_data, trace_output def stitch_design(self, sim_design = False, export = False, **kwargs): - self.backend.stitch_design(self.output_dir, self.project_name, sim_design = sim_design, export = export, **kwargs) + nn_config = self.parse_nn_config() + self.backend.stitch_design( + output_dir=self.output_dir, + project_name=self.project_name, + sim_design=sim_design, + export=export, + nn_config=nn_config, + **kwargs) def _print_status(self, status): - # Clear the terminal line and print build status print('\r', end='') status_icons = { 'Pending': '○', diff --git a/hls4ml/report/vivado_report.py b/hls4ml/report/vivado_report.py index 8eec339cc5..f854dc60b8 100644 --- a/hls4ml/report/vivado_report.py +++ b/hls4ml/report/vivado_report.py @@ -2,7 +2,6 @@ import re import sys import xml.etree.ElementTree as ET -from lxml import etree def read_vivado_report(hls_dir, full_report=False): @@ -672,201 +671,4 @@ def _make_report_body(report_dict, make_table_template, make_header_template): body = body.format(**params) - return body - -def parse_xml_and_write_testbench(vivado_base_folder = None): - - component_xml_path = os.path.join(vivado_base_folder, 'ip_repo/component.xml') - if not os.path.exists(component_xml_path): - raise FileNotFoundError(f"component.xml not found at {component_xml_path}") - - # Parse the XML file - tree = etree.parse('model_2graphs/vivado_stitched_design/ip_repo/component.xml') - root = tree.getroot() - - # Define the namespaces - ns = { - 'spirit': 'http://www.spiritconsortium.org/XMLSchema/SPIRIT/1685-2009', - 'xilinx': 'http://www.xilinx.com', - 'xsi': 'http://www.w3.org/2001/XMLSchema-instance' - } - - # Extract ports - ports = root.findall('.//spirit:model/spirit:ports/spirit:port', namespaces=ns) - - inputs = [] - outputs = [] - - for port in ports: - name = port.find('spirit:name', namespaces=ns).text - wire = port.find('spirit:wire', namespaces=ns) - if wire is not None: - direction = wire.find('spirit:direction', namespaces=ns).text - vector = wire.find('spirit:vector', namespaces=ns) - if vector is not None: - left = vector.find('spirit:left', namespaces=ns).text - right = vector.find('spirit:right', namespaces=ns).text - width = abs(int(left) - int(right)) + 1 - else: - width = 1 - port_info = {'name': name, 'direction': direction, 'width': width} - if direction == 'in': - inputs.append(port_info) - elif direction == 'out': - outputs.append(port_info) - - # Generate testbench code - testbench_code = '`timescale 1ns / 1ps\n\n' - testbench_code += 'module tb_design_1_wrapper;\n\n' - - # Generate signal declarations - # Clock and reset signals - clock_signal = None - reset_signal = None - signal_declarations = '' - - for port in inputs + outputs: - width_str = f'[{port["width"]-1}:0] ' if port['width'] > 1 else '' - if port['direction'] == 'in': - if 'clk' in port['name'].lower(): - clock_signal = port['name'] - if 'rst' in port['name'].lower(): - reset_signal = port['name'] - signal_declarations 
+= f' reg {width_str}{port["name"]};\n' - else: - signal_declarations += f' wire {width_str}{port["name"]};\n' - - testbench_code += '// Signal Declarations\n' - testbench_code += signal_declarations + '\n' - - # Instantiate the DUT - dut_instantiation = ' // Instantiate the Design Under Test (DUT)\n' - dut_instantiation += ' stitched_design dut (\n' - - port_connections = [] - for port in inputs + outputs: - port_connections.append(f" .{port['name']}({port['name']})") - dut_instantiation += ',\n'.join(port_connections) - dut_instantiation += '\n );\n\n' - - testbench_code += dut_instantiation - - # Clock generation - clock_logic = '' - if clock_signal: - clock_logic += f' // Clock Generation (100 MHz)\n' - clock_logic += f' initial begin\n' - clock_logic += f' {clock_signal} = 0;\n' - clock_logic += f' forever #5 {clock_signal} = ~{clock_signal}; // Clock period of 10 ns\n' - clock_logic += f' end\n\n' - - testbench_code += clock_logic - - # Reset generation - reset_logic = '' - if reset_signal: - reset_logic += f' // Reset Generation\n' - reset_logic += f' initial begin\n' - reset_logic += f' {reset_signal} = 0;\n' - reset_logic += f' repeat (5) @(posedge {clock_signal});\n' - reset_logic += f' {reset_signal} = 1;\n' - reset_logic += f' end\n\n' - - testbench_code += reset_logic - - # Control signals initialization - control_signals_init = ' // Control Signals Initialization\n' - control_signals_init += ' initial begin\n' - - # Initialize control signals - for port in inputs: - if port['name'] != clock_signal and port['name'] != reset_signal: - control_signals_init += f' {port["name"]} = 0;\n' - - # Set tready signals to 1 - for port in inputs: - if 'tready' in port['name'].lower(): - control_signals_init += f' {port["name"]} = 1;\n' - for port in outputs: - if 'tready' in port['name'].lower(): - control_signals_init += f' {port["name"]} = 1;\n' - - control_signals_init += ' end\n\n' - testbench_code += control_signals_init - - # Input stimulus - input_stimulus = ' // Input Stimulus\n' - input_stimulus += ' integer i;\n' - input_stimulus += ' initial begin\n' - if reset_signal: - input_stimulus += f' wait ({reset_signal} == 1);\n' - input_stimulus += f' repeat (2) @(posedge {clock_signal});\n\n' - - ap_start_signal = next((sig['name'] for sig in inputs if 'ap_start' in sig['name']), None) - if ap_start_signal: - input_stimulus += f' // Start the operation\n' - input_stimulus += f' {ap_start_signal} = 1;\n\n' - - # Send input data - tdata_signal = next((sig['name'] for sig in inputs if 'tdata' in sig['name'].lower()), None) - tvalid_signal = next((sig['name'] for sig in inputs if 'tvalid' in sig['name'].lower()), None) - tready_signal = next((sig['name'] for sig in outputs if 'tready' in sig['name'].lower()), None) - - if tdata_signal and tvalid_signal and tready_signal: - tdata_width = next((sig['width'] for sig in inputs if sig['name'] == tdata_signal), 32) - input_stimulus += f' // Send input data\n' - input_stimulus += f' {tvalid_signal} = 1;\n' - input_stimulus += f' for (i = 0; i < 16; i = i + 1) begin\n' - input_stimulus += f' {tdata_signal} = $random;\n' - input_stimulus += f' while ({tready_signal} == 0) @(posedge {clock_signal});\n' - input_stimulus += f' @(posedge {clock_signal});\n' - input_stimulus += f' end\n' - input_stimulus += f' {tvalid_signal} = 0;\n' - input_stimulus += ' end\n\n' - testbench_code += input_stimulus - - # Output capture and latency measurement - output_capture = ' // Output Capture\n' - output_capture += ' integer outfile;\n' - output_capture += ' 
initial begin\n' - output_capture += ' outfile = $fopen("output_data.txt", "w");\n' - if reset_signal: - output_capture += f' wait ({reset_signal} == 1);\n' - output_capture += f' repeat (2) @(posedge {clock_signal});\n\n' - output_capture += ' // Monitor outputs\n' - tdata_out_signal = next((sig['name'] for sig in outputs if 'tdata' in sig['name'].lower()), None) - tvalid_out_signal = next((sig['name'] for sig in outputs if 'tvalid' in sig['name'].lower()), None) - tready_out_signal = next((sig['name'] for sig in inputs if 'tready' in sig['name'].lower()), None) - - if tdata_out_signal and tvalid_out_signal and tready_out_signal: - output_capture += ' forever begin\n' - output_capture += f' @(posedge {clock_signal});\n' - output_capture += f' if ({tvalid_out_signal} && {tready_out_signal}) begin\n' - output_capture += f' $fwrite(outfile, "%0d ns: Data = %h\\n", $time, {tdata_out_signal});\n' - output_capture += f' end\n' - output_capture += ' end\n' - else: - output_capture += ' // No output AXI4-Stream signals detected\n' - output_capture += ' end\n\n' - - testbench_code += output_capture - - # Finish simulation when ap_done is asserted - ap_done_signal = next((sig['name'] for sig in outputs if 'ap_done' in sig['name']), None) - if ap_done_signal: - testbench_code += f' // Finish simulation when operation is done\n' - testbench_code += f' initial begin\n' - testbench_code += f' wait ({ap_done_signal} == 1);\n' - testbench_code += f' $fclose(outfile);\n' - testbench_code += f' $finish;\n' - testbench_code += f' end\n\n' - - testbench_code += 'endmodule\n' - - # Write the testbench to a file - testbench_file_path = os.path.join(vivado_base_folder, 'testbench.v') - with open(testbench_file_path, 'w') as f: - f.write(testbench_code) - print("Testbench generated successfully.") - - return testbench_file_path + return body \ No newline at end of file diff --git a/hls4ml/utils/simulation_utils.py b/hls4ml/utils/simulation_utils.py new file mode 100644 index 0000000000..1c742a5493 --- /dev/null +++ b/hls4ml/utils/simulation_utils.py @@ -0,0 +1,251 @@ +import os +from lxml import etree + +def parse_component_xml(component_xml_path): + """ + Parse the given component.xml file and return structured information + about the input and output ports. + + Returns: + inputs (list): A list of dicts, each containing 'name', 'direction', and 'width' for input ports. + outputs (list): A list of dicts, each containing 'name', 'direction', and 'width' for output ports. 
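+
+    Example entry (illustrative; the port name is hypothetical):
+        {'name': 'layer0_out_tdata', 'direction': 'out', 'width': 16}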
+ """ + if not os.path.exists(component_xml_path): + raise FileNotFoundError(f"component.xml not found at {component_xml_path}") + + # Parse the XML file + tree = etree.parse(component_xml_path) + root = tree.getroot() + + # Define the namespaces + ns = { + 'spirit': 'http://www.spiritconsortium.org/XMLSchema/SPIRIT/1685-2009', + 'xilinx': 'http://www.xilinx.com', + 'xsi': 'http://www.w3.org/2001/XMLSchema-instance' + } + + # Extract ports + ports = root.findall('.//spirit:model/spirit:ports/spirit:port', namespaces=ns) + inputs = [] + outputs = [] + + for port in ports: + name = port.find('spirit:name', namespaces=ns).text + wire = port.find('spirit:wire', namespaces=ns) + if wire is not None: + direction = wire.find('spirit:direction', namespaces=ns).text + vector = wire.find('spirit:vector', namespaces=ns) + if vector is not None: + left = vector.find('spirit:left', namespaces=ns).text + right = vector.find('spirit:right', namespaces=ns).text + width = abs(int(left) - int(right)) + 1 + else: + width = 1 + + port_info = {'name': name, 'direction': direction, 'width': width} + if direction == 'in': + inputs.append(port_info) + elif direction == 'out': + outputs.append(port_info) + + return inputs, outputs + + +def generate_verilog_testbench(nn_config, testbench_output_path): + inputs = nn_config['inputs'] + outputs = nn_config['outputs'] + + input_signals = [] + output_signals = [] + + for input_item in inputs: + total_bits = input_item['integer_bits'] + input_item['fractional_bits'] + input_signals.append((input_item['name'], total_bits)) + + for output_item in outputs: + total_bits = output_item['integer_bits'] + output_item['fractional_bits'] + output_signals.append((output_item['name'], total_bits)) + + with open(testbench_output_path, 'w') as f: + # Write the initial part of the testbench + f.write('`timescale 1ns / 1ps\n\n') + f.write('module tb_design_1_wrapper;\n\n') + f.write(' // Clock and Reset Signals\n') + f.write(' reg ap_clk;\n') + f.write(' reg ap_rst_n;\n\n') + f.write(' // Control Signals\n') + f.write(' reg ap_start;\n') + f.write(' wire ap_done;\n\n') + + # Generate AXI4-Stream interface signals for inputs + for layer in nn_config['inputs']: + total_bits = layer['integer_bits'] + layer['fractional_bits'] + f.write(f' reg [{(total_bits * layer["batch_size"]) - 1}:0] {layer["name"]}_tdata;\n') + f.write(f' reg {layer["name"]}_tvalid;\n') + f.write(f' wire {layer["name"]}_tready;\n\n') + + # Generate AXI4-Stream interface signals for outputs + for layer in nn_config['outputs']: + total_bits = layer['integer_bits'] + layer['fractional_bits'] + f.write(f' wire [{(total_bits * layer["batch_size"]) - 1}:0] {layer["name"]}_tdata;\n') + f.write(f' wire {layer["name"]}_tvalid;\n') + f.write(f' reg {layer["name"]}_tready;\n\n') + + # Instantiate the DUT + f.write(' // Instantiate the Design Under Test (DUT)\n') + f.write(' stitched_design dut (\n') + f.write(' .ap_clk(ap_clk),\n') + f.write(' .ap_done(ap_done),\n') + f.write(' .ap_rst_n(ap_rst_n),\n') + f.write(' .ap_start(ap_start),\n') + # Connect input AXI4-Stream interfaces + for layer in nn_config['inputs']: + name = layer["name"] + f.write(f' .{name}_tdata({name}_tdata),\n') + f.write(f' .{name}_tready({name}_tready),\n') + f.write(f' .{name}_tvalid({name}_tvalid),\n') + # Connect output AXI4-Stream interfaces + for layer in nn_config['outputs'][:-1]: + name = layer["name"] + f.write(f' .{name}_tdata({name}_tdata),\n') + f.write(f' .{name}_tready({name}_tready),\n') + f.write(f' .{name}_tvalid({name}_tvalid),\n') + # Handle 
the last output layer without a trailing comma + last_output_layer = nn_config['outputs'][-1] + name = last_output_layer["name"] + f.write(f' .{name}_tdata({name}_tdata),\n') + f.write(f' .{name}_tready({name}_tready),\n') + f.write(f' .{name}_tvalid({name}_tvalid)\n') + f.write(' );\n\n') + + # Add clock generation + f.write(' // Clock Generation (100 MHz)\n') + f.write(' initial begin\n') + f.write(' ap_clk = 0;\n') + f.write(' forever #5 ap_clk = ~ap_clk; // Clock period of 10 ns\n') + f.write(' end\n\n') + + # Reset generation + f.write(' // Reset Generation\n') + f.write(' initial begin\n') + f.write(' ap_rst_n = 0;\n') + f.write(' repeat (5) @(posedge ap_clk);\n') + f.write(' ap_rst_n = 1;\n') + f.write(' end\n\n') + + # Initialize Control Signals + f.write(' // Control Signal Initialization\n') + f.write(' initial begin\n') + f.write(' ap_start = 0;\n') + for name, _ in input_signals: + f.write(f' {name}_tvalid = 0;\n') + for name, _ in output_signals: + f.write(f' {name}_tready = 1;\n') + f.write(' end\n\n') + + # Cycle counter + f.write(' // Cycle counter\n') + f.write(' reg [63:0] cycle_count = 0;\n') + f.write(' reg [63:0] start_cycle = 0;\n') + f.write(' reg [63:0] end_cycle = 0;\n') + f.write(' always @(posedge ap_clk) begin\n') + f.write(' if (!ap_rst_n)\n') + f.write(' cycle_count <= 0;\n') + f.write(' else\n') + f.write(' cycle_count <= cycle_count + 1;\n') + f.write(' end\n\n') + + # Data Transmission + f.write(' // Data Transmission\n') + f.write(' integer i, j;\n') + f.write(' integer total_bits;\n') + f.write(' initial begin\n') + f.write(' // Wait for reset deassertion\n') + f.write(' wait (ap_rst_n == 1);\n') + f.write(' repeat (2) @(posedge ap_clk);\n\n') + + f.write(' // Start the operation\n') + f.write(' ap_start = 1;\n') + + # First Data Pattern: All Zeros + for layer in nn_config['inputs']: + f.write(f' // Sending all zeros for {layer["name"]}\n') + f.write(f' total_bits = {layer["integer_bits"] + layer["fractional_bits"]};\n') + f.write(f' {layer["name"]}_tvalid = 1;\n\n') + f.write(f' for (j = 0; j < {layer["fifo_depth"]}; j = j + 1) begin\n') + for k in range(layer['batch_size']): + upper = (k + 1) * (layer["integer_bits"] + layer["fractional_bits"]) - 1 + lower = k * (layer["integer_bits"] + layer["fractional_bits"]) + f.write(f' {layer["name"]}_tdata[{upper}:{lower}] = 0;\n') + f.write(f' while ({layer["name"]}_tready == 0) @(posedge ap_clk);\n') + f.write(f' @(posedge ap_clk);\n') + f.write(f' end\n') + f.write(f' {layer["name"]}_tvalid = 0;\n\n') + + # Second Data Pattern: Fixed Value of 1 + for layer in nn_config['inputs']: + f.write(f' // Sending fixed value 1 for {layer["name"]}\n') + f.write(f' total_bits = {layer["integer_bits"] + layer["fractional_bits"]};\n') + f.write(f' {layer["name"]}_tvalid = 1;\n\n') + f.write(f' for (j = 0; j < {layer["fifo_depth"]}; j = j + 1) begin\n') + for k in range(layer['batch_size']): + upper = (k + 1) * (layer["integer_bits"] + layer["fractional_bits"]) - 1 + lower = k * (layer["integer_bits"] + layer["fractional_bits"]) + f.write(f' {layer["name"]}_tdata[{upper}:{lower}] = 1 << {layer["fractional_bits"]};\n') + f.write(f' while ({layer["name"]}_tready == 0) @(posedge ap_clk);\n') + f.write(f' @(posedge ap_clk);\n') + f.write(f' end\n') + f.write(f' {layer["name"]}_tvalid = 0;\n\n') + + f.write(' start_cycle = cycle_count;\n\n') + # Third Data Pattern: All zeros (this is where we measure cycles) + for layer in nn_config['inputs']: + f.write(f' // Sending all zeros for {layer["name"]} (this is where we measure 
cycles)\n') + f.write(f' total_bits = {layer["integer_bits"] + layer["fractional_bits"]};\n') + f.write(f' {layer["name"]}_tvalid = 1;\n\n') + f.write(f' for (j = 0; j < {layer["fifo_depth"]}; j = j + 1) begin\n') + for k in range(layer['batch_size']): + upper = (k + 1) * (layer["integer_bits"] + layer["fractional_bits"]) - 1 + lower = k * (layer["integer_bits"] + layer["fractional_bits"]) + f.write(f' {layer["name"]}_tdata[{upper}:{lower}] = 0;\n') + f.write(f' while ({layer["name"]}_tready == 0) @(posedge ap_clk);\n') + f.write(f' @(posedge ap_clk);\n') + f.write(f' end\n') + f.write(f' {layer["name"]}_tvalid = 0;\n\n') + + f.write(' // Wait for operation to complete\n') + f.write(' wait (ap_done == 1);\n') + f.write(' end_cycle = cycle_count;\n') + f.write(' $display("Total cycles from start to done: %0d", end_cycle - start_cycle);\n') + f.write(' repeat (5) @(posedge ap_clk);\n') + f.write(' $finish;\n') + f.write(' end\n\n') + + # Output Handling + f.write(' // Output Data Capture\n') + f.write(' // Decode and display outputs in fixed-point format\n') + for layer in nn_config['outputs']: + signed_str = layer.get('signed', 1) + i_bits = layer['integer_bits'] + f_bits = layer['fractional_bits'] + total_bits = i_bits + f_bits + f.write(f' integer idx;\n') + f.write(f' reg signed [{total_bits-1}:0] fixed_val;\n') + f.write(f' real real_val;\n') + + # We'll add an always block per output to print whenever valid & ready + f.write(f' always @(posedge ap_clk) begin\n') + f.write(f' if ({layer["name"]}_tvalid && {layer["name"]}_tready) begin\n') + # For simplicity, assume batch_size = 1 here. If you have multiple batch elements, you'd need to loop. + # If batch_size > 1, we would display each slice separately. + f.write(f' for (idx = 0; idx < {layer["batch_size"]}; idx = idx + 1) begin\n') + f.write(f' fixed_val = {layer["name"]}_tdata[(idx+1)*{total_bits}-1 -: {total_bits}];\n') + # If signed, sign-extend was already done due to reg signed + # Convert to real by dividing by 2^(fractional_bits) + f.write(f' real_val = fixed_val / (1.0 * (1 << {f_bits}));\n') + f.write(f' $display("Output {layer["name"]}[%0d]: integer_bits=%0d fractional_bits=%0d value=%f", idx, {i_bits}, {f_bits}, real_val);\n') + f.write(f' end\n') + f.write(' end\n') + f.write(' end\n\n') + + f.write('endmodule\n') \ No newline at end of file diff --git a/scripts/ip_stitcher.tcl b/scripts/ip_stitcher.tcl index 306d0a1c52..99cb580100 100644 --- a/scripts/ip_stitcher.tcl +++ b/scripts/ip_stitcher.tcl @@ -541,21 +541,21 @@ if {$export_design} { if {$sim_design} { puts "Adding simulation Verilog file..." 
if {$sim_verilog_file != ""} { - if { [file exists $sim_verilog_file] } { + if { [file exists "$base_dir/$sim_verilog_file"] } { if { [llength [get_filesets sim_1]] == 0 } { create_fileset -simset sim_1 } set_property SOURCE_SET sources_1 [get_filesets sim_1] - add_files -fileset sim_1 -norecurse -scan_for_includes $sim_verilog_file + add_files -fileset sim_1 -norecurse -scan_for_includes "$base_dir/$sim_verilog_file" update_compile_order -fileset sim_1 - puts "Simulation Verilog file added: $sim_verilog_file" + puts "Simulation Verilog file added: $base_dir/$sim_verilog_file" # Set the simulation top module if necessary set_property top tb_design_1_wrapper [get_filesets sim_1] # Run the behavioral simulation set_property -name {xsim.simulate.runtime} -value {200000ns} -objects [get_filesets sim_1] launch_simulation } else { - puts "Error: Simulation Verilog file not found: $sim_verilog_file" + puts "Error: Simulation Verilog file not found: $base_dir/$sim_verilog_file" } } else { puts "Error: sim_verilog_file not provided." From 27c76b31d2fc6a628bf2d0c5ff3cb3eac1a02bfa Mon Sep 17 00:00:00 2001 From: dimdano Date: Mon, 9 Dec 2024 15:17:39 +0100 Subject: [PATCH 19/50] read testbench output --- hls4ml/backends/vitis/vitis_backend.py | 12 ++-- hls4ml/utils/simulation_utils.py | 85 +++++++++++++++++--------- 2 files changed, 63 insertions(+), 34 deletions(-) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index df6e7d2fbc..06852b6b8b 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -7,7 +7,7 @@ from hls4ml.backends import VivadoBackend from hls4ml.model.flow import get_flow, register_flow from hls4ml.report import parse_vivado_report -from hls4ml.utils.simulation_utils import generate_verilog_testbench +from hls4ml.utils.simulation_utils import generate_verilog_testbench, read_testbench_log class VitisBackend(VivadoBackend): @@ -135,9 +135,7 @@ def stitch_design(self, output_dir, project_name, sim_design = False, export = F os.makedirs(output_dir, exist_ok=True) stitched_design_dir = os.path.join(output_dir, 'vivado_stitched_design') - os.makedirs(stitched_design_dir, exist_ok=True) - spec = importlib.util.find_spec("hls4ml") hls4ml_path = os.path.dirname(spec.origin) @@ -150,7 +148,6 @@ def stitch_design(self, output_dir, project_name, sim_design = False, export = F testbench_file_path = os.path.join(stitched_design_dir, "testbench.v") generate_verilog_testbench(nn_config, testbench_file_path) - # Build the command as a list stitch_command = [ 'vivado', '-mode', 'batch', '-nojournal', '-nolog', '-notrace', '-source', os.path.join(hls4ml_path, '../scripts/ip_stitcher.tcl'), @@ -164,7 +161,6 @@ def stitch_design(self, output_dir, project_name, sim_design = False, export = F stderr_log = os.path.join(stitched_design_dir, 'stitcher_stderr.log') with open(stdout_log, 'w') as stdout_file, open(stderr_log, 'w') as stderr_file: - # Use subprocess.Popen to capture output process = subprocess.Popen( stitch_command, cwd=output_dir, @@ -175,4 +171,8 @@ def stitch_design(self, output_dir, project_name, sim_design = False, export = F ) process.communicate() if process.returncode != 0: - raise Exception(f'Stitching failed for {project_name}. See logs for details.') \ No newline at end of file + raise Exception(f'Stitching failed for {project_name}. 
See logs for details.') + + if(sim_design): + testbench_logfile_path = os.path.join(stitched_design_dir, 'vivado_stitched_design.sim/sim_1/behav/xsim/testbench_log.csv') + read_testbench_log(testbench_file_path) \ No newline at end of file diff --git a/hls4ml/utils/simulation_utils.py b/hls4ml/utils/simulation_utils.py index 1c742a5493..268635e338 100644 --- a/hls4ml/utils/simulation_utils.py +++ b/hls4ml/utils/simulation_utils.py @@ -1,5 +1,8 @@ import os from lxml import etree +import json +import numpy as np +import pandas as pd def parse_component_xml(component_xml_path): """ @@ -135,6 +138,9 @@ def generate_verilog_testbench(nn_config, testbench_output_path): # Initialize Control Signals f.write(' // Control Signal Initialization\n') + f.write(' integer csv_file;\n') + f.write(' integer j;\n') + f.write(' integer total_bits;\n') f.write(' initial begin\n') f.write(' ap_start = 0;\n') for name, _ in input_signals: @@ -157,21 +163,25 @@ def generate_verilog_testbench(nn_config, testbench_output_path): # Data Transmission f.write(' // Data Transmission\n') - f.write(' integer i, j;\n') - f.write(' integer total_bits;\n') f.write(' initial begin\n') f.write(' // Wait for reset deassertion\n') f.write(' wait (ap_rst_n == 1);\n') f.write(' repeat (2) @(posedge ap_clk);\n\n') f.write(' // Start the operation\n') + f.write(' csv_file = $fopen("testbench_log.csv", "w");\n') + f.write(' if (csv_file == 0) begin\n') + f.write(' $display("ERROR: Could not open csv log file.");\n') + f.write(' $finish;\n') + f.write(' end\n') + f.write(' $fwrite(csv_file, "output_name,index,value\\n");\n\n') f.write(' ap_start = 1;\n') # First Data Pattern: All Zeros for layer in nn_config['inputs']: f.write(f' // Sending all zeros for {layer["name"]}\n') - f.write(f' total_bits = {layer["integer_bits"] + layer["fractional_bits"]};\n') - f.write(f' {layer["name"]}_tvalid = 1;\n\n') + #f.write(f' total_bits = {layer["integer_bits"] + layer["fractional_bits"]};\n') + f.write(f' {layer["name"]}_tvalid = 1;\n') f.write(f' for (j = 0; j < {layer["fifo_depth"]}; j = j + 1) begin\n') for k in range(layer['batch_size']): upper = (k + 1) * (layer["integer_bits"] + layer["fractional_bits"]) - 1 @@ -185,8 +195,8 @@ def generate_verilog_testbench(nn_config, testbench_output_path): # Second Data Pattern: Fixed Value of 1 for layer in nn_config['inputs']: f.write(f' // Sending fixed value 1 for {layer["name"]}\n') - f.write(f' total_bits = {layer["integer_bits"] + layer["fractional_bits"]};\n') - f.write(f' {layer["name"]}_tvalid = 1;\n\n') + #f.write(f' total_bits = {layer["integer_bits"] + layer["fractional_bits"]};\n') + f.write(f' {layer["name"]}_tvalid = 1;\n') f.write(f' for (j = 0; j < {layer["fifo_depth"]}; j = j + 1) begin\n') for k in range(layer['batch_size']): upper = (k + 1) * (layer["integer_bits"] + layer["fractional_bits"]) - 1 @@ -198,11 +208,11 @@ def generate_verilog_testbench(nn_config, testbench_output_path): f.write(f' {layer["name"]}_tvalid = 0;\n\n') f.write(' start_cycle = cycle_count;\n\n') - # Third Data Pattern: All zeros (this is where we measure cycles) + # Third Data Pattern: All zeros (here measure output and cycles) for layer in nn_config['inputs']: - f.write(f' // Sending all zeros for {layer["name"]} (this is where we measure cycles)\n') - f.write(f' total_bits = {layer["integer_bits"] + layer["fractional_bits"]};\n') - f.write(f' {layer["name"]}_tvalid = 1;\n\n') + f.write(f' // Sending all zeros for {layer["name"]} (here we measure output and cycles)\n') + #f.write(f' total_bits = 
{layer["integer_bits"] + layer["fractional_bits"]};\n') + f.write(f' {layer["name"]}_tvalid = 1;\n') f.write(f' for (j = 0; j < {layer["fifo_depth"]}; j = j + 1) begin\n') for k in range(layer['batch_size']): upper = (k + 1) * (layer["integer_bits"] + layer["fractional_bits"]) - 1 @@ -217,35 +227,54 @@ def generate_verilog_testbench(nn_config, testbench_output_path): f.write(' wait (ap_done == 1);\n') f.write(' end_cycle = cycle_count;\n') f.write(' $display("Total cycles from start to done: %0d", end_cycle - start_cycle);\n') - f.write(' repeat (5) @(posedge ap_clk);\n') + f.write(' // Write latency to JSON\n') + f.write(' $fwrite(csv_file, "latency_cycles,0,%d\\n", end_cycle - start_cycle);\n') + f.write(' repeat (2) @(posedge ap_clk);\n') + f.write(' $fclose(csv_file);\n') f.write(' $finish;\n') f.write(' end\n\n') # Output Handling f.write(' // Output Data Capture\n') - f.write(' // Decode and display outputs in fixed-point format\n') - for layer in nn_config['outputs']: + for i, layer in enumerate(nn_config['outputs']): signed_str = layer.get('signed', 1) i_bits = layer['integer_bits'] f_bits = layer['fractional_bits'] total_bits = i_bits + f_bits - f.write(f' integer idx;\n') - f.write(f' reg signed [{total_bits-1}:0] fixed_val;\n') - f.write(f' real real_val;\n') + layer_name = layer["name"] + + f.write(f' integer idx_{i};\n') + f.write(f' reg signed [{total_bits-1}:0] fixed_val_{i};\n') + f.write(f' real real_val_{i};\n') - # We'll add an always block per output to print whenever valid & ready f.write(f' always @(posedge ap_clk) begin\n') - f.write(f' if ({layer["name"]}_tvalid && {layer["name"]}_tready) begin\n') - # For simplicity, assume batch_size = 1 here. If you have multiple batch elements, you'd need to loop. - # If batch_size > 1, we would display each slice separately. 
- f.write(f' for (idx = 0; idx < {layer["batch_size"]}; idx = idx + 1) begin\n') - f.write(f' fixed_val = {layer["name"]}_tdata[(idx+1)*{total_bits}-1 -: {total_bits}];\n') - # If signed, sign-extend was already done due to reg signed - # Convert to real by dividing by 2^(fractional_bits) - f.write(f' real_val = fixed_val / (1.0 * (1 << {f_bits}));\n') - f.write(f' $display("Output {layer["name"]}[%0d]: integer_bits=%0d fractional_bits=%0d value=%f", idx, {i_bits}, {f_bits}, real_val);\n') - f.write(f' end\n') + f.write(f' if ({layer_name}_tvalid && {layer_name}_tready) begin\n') + f.write(f' for (idx_{i} = 0; idx_{i} < {layer["batch_size"]}; idx_{i} = idx_{i} + 1) begin\n') + f.write(f' fixed_val_{i} = {layer_name}_tdata[(idx_{i}+1)*{total_bits}-1 -: {total_bits}];\n') + f.write(f' real_val_{i} = fixed_val_{i} / (1.0 * (1 << {f_bits}));\n') + f.write(f' $display("Output {layer["name"]}[%0d]: integer_bits=%0d fractional_bits=%0d value=%f", idx_{i}, {i_bits}, {f_bits}, real_val_{i});\n') + f.write(' // Write to csv file\n') + f.write(f' $fwrite(csv_file, "%s,%0d,%f\\n", "{layer_name}", idx_{i}, real_val_{i});\n') + f.write(' end\n') f.write(' end\n') f.write(' end\n\n') - f.write('endmodule\n') \ No newline at end of file + f.write('endmodule\n') + + +def read_testbench_log(testbench_file): + # Read the CSV and print it in a numpy-structure + + if not os.path.exists(testbench_file): + print(f"Error: The file '{testbench_file}' does not exist.") + return + df = pd.read_csv(testbench_file) + latency = df[df['output_name'] == 'latency_cycles']['value'].iloc[0] + grouped = df[df['output_name'] != 'latency_cycles'].groupby('output_name') + for name, group in grouped: + indices = group['index'].astype(int) + values = group['value'] + array = np.zeros(max(indices) + 1) + array[indices] = values + print(f"{name}:\n{array}\n") + print(f"Latency (cycles): {int(latency)}") \ No newline at end of file From 704a874158ad7cb9040154961e8a96e69e5a9e92 Mon Sep 17 00:00:00 2001 From: dimdano Date: Tue, 10 Dec 2024 17:29:52 +0100 Subject: [PATCH 20/50] minor changes --- hls4ml/backends/vitis/vitis_backend.py | 6 ++++-- hls4ml/converters/keras_to_hls.py | 15 +++++++-------- hls4ml/model/graph.py | 15 +++++---------- hls4ml/utils/simulation_utils.py | 21 ++++++++++++--------- scripts/ip_stitcher.tcl | 7 ++++--- 5 files changed, 32 insertions(+), 32 deletions(-) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index 06852b6b8b..be68aa1d18 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -147,7 +147,9 @@ def stitch_design(self, output_dir, project_name, sim_design = False, export = F if(sim_design): testbench_file_path = os.path.join(stitched_design_dir, "testbench.v") generate_verilog_testbench(nn_config, testbench_file_path) + print('Verilog testbench generated.') + print('Running build process of stitched IP...\n') stitch_command = [ 'vivado', '-mode', 'batch', '-nojournal', '-nolog', '-notrace', '-source', os.path.join(hls4ml_path, '../scripts/ip_stitcher.tcl'), @@ -174,5 +176,5 @@ def stitch_design(self, output_dir, project_name, sim_design = False, export = F raise Exception(f'Stitching failed for {project_name}. 
See logs for details.')
 
         if(sim_design):
-            testbench_logfile_path = os.path.join(stitched_design_dir, 'vivado_stitched_design.sim/sim_1/behav/xsim/testbench_log.csv')
-            read_testbench_log(testbench_file_path)
\ No newline at end of file
+            testbench_log_path = os.path.join(stitched_design_dir, 'vivado_stitched_design.sim/sim_1/behav/xsim/testbench_log.csv')
+            read_testbench_log(testbench_log_path)
\ No newline at end of file
diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py
index a65e0be81c..c9dd749296 100644
--- a/hls4ml/converters/keras_to_hls.py
+++ b/hls4ml/converters/keras_to_hls.py
@@ -322,20 +322,19 @@ def parse_keras_model(model_arch, reader):
     return layer_list, input_layers, output_layers, output_shapes
 
 
-def keras_to_hls(config, split_layer_names = []):
+def keras_to_hls(config, split_layer_names=None):
     model_arch, reader = get_model_arch(config)
     layer_list, input_layers, output_layers, output_shapes = parse_keras_model(model_arch, reader)
-    merge_layers = ['add', 'subtract', 'multiply', 'average', 'maximum', 'minimum', 'concatenate', 'dot']
     print('Creating HLS model...')
+    merge_layers = ['add', 'subtract', 'multiply', 'average', 'maximum', 'minimum', 'concatenate', 'dot']
     if split_layer_names:
         if any(any(layer in name for layer in merge_layers) for name in split_layer_names):
-            raise ValueError(f"Split layer must not be a merge layer")
-        else:
-            hls_models = ModelGraph.make_multi_graph(config, layer_list, output_shapes, split_layer_names)
-            print('Multi-graph HLS model created.')
-            return hls_models
+            raise ValueError('Split layer must not be a merge layer')
+        hls_model = ModelGraph.make_multi_graph(config, layer_list, output_shapes, split_layer_names)
+        print('Multi-graph HLS model created.')
     else:
         hls_model = ModelGraph(config, layer_list, input_layers, output_layers)
         print('HLS model created.')
-        return hls_model
+
+    return hls_model
diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py
index 4b0383e616..f3fcb7bfc9 100644
--- a/hls4ml/model/graph.py
+++ b/hls4ml/model/graph.py
@@ -912,7 +912,7 @@ def make_multi_graph(cls, config, layer_list, output_shapes, split_layer_names):
             split_layer_names (List[str]): The names of the layers to split at.
 
         Returns:
-            List[ModelGraph]: List of ModelGraph instances resulting from the splits.
+            MultiModelGraph: An instance of MultiModelGraph containing the ModelGraphs created from the subgraphs.
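+
+        Example (illustrative; assumes the parsed model contains a layer named 'fc2'):
+            multi_graph = ModelGraph.make_multi_graph(config, layer_list, output_shapes, ['fc2'])
+            first_graph, second_graph = multi_graph[0], multi_graph[1]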
""" if not split_layer_names: raise ValueError("No split layer names provided.") @@ -924,13 +924,9 @@ def make_multi_graph(cls, config, layer_list, output_shapes, split_layer_names): if name not in layer_names: raise ValueError(f"Layer '{name}' not found in the model.") - # Get split indices and sort them + # Split the layer_list into subgraphs split_indices = sorted([layer_names.index(name) for name in split_layer_names]) - - # Add start and end indices to cover the entire layer list indices = [0] + split_indices + [len(layer_list)] - - # Split the layer_list into subgraphs subgraphs_layer_lists = [] for i in range(len(indices) - 1): start = indices[i] @@ -943,7 +939,6 @@ def make_multi_graph(cls, config, layer_list, output_shapes, split_layer_names): original_OutputDir = config['OutputDir'] original_ProjectName = config['ProjectName'] current_index = 0 - #curr_output_vars = {} last_output_precision = None for idx, sub_layer_list in enumerate(subgraphs_layer_lists): @@ -1011,8 +1006,7 @@ def make_multi_graph(cls, config, layer_list, output_shapes, split_layer_names): layer_indices = [layer.index for layer in hls_model.graph.values()] if layer_indices: max_index = max(layer_indices) - current_index = max_index - 1 # we have the input layer as well - #curr_output_vars = hls_model.output_vars + current_index = max_index - 1 # we have the new input layer as well model_graphs.append(hls_model) return MultiModelGraph(model_graphs) @@ -1114,8 +1108,9 @@ def predict(self, x): output_data = g.predict(input_data) input_data = output_data return output_data - + def trace(self, x): + # TODO: finish trace function input_data = x trace_output = [] for g in self.graphs: diff --git a/hls4ml/utils/simulation_utils.py b/hls4ml/utils/simulation_utils.py index 268635e338..df15f1a69f 100644 --- a/hls4ml/utils/simulation_utils.py +++ b/hls4ml/utils/simulation_utils.py @@ -180,7 +180,6 @@ def generate_verilog_testbench(nn_config, testbench_output_path): # First Data Pattern: All Zeros for layer in nn_config['inputs']: f.write(f' // Sending all zeros for {layer["name"]}\n') - #f.write(f' total_bits = {layer["integer_bits"] + layer["fractional_bits"]};\n') f.write(f' {layer["name"]}_tvalid = 1;\n') f.write(f' for (j = 0; j < {layer["fifo_depth"]}; j = j + 1) begin\n') for k in range(layer['batch_size']): @@ -195,7 +194,6 @@ def generate_verilog_testbench(nn_config, testbench_output_path): # Second Data Pattern: Fixed Value of 1 for layer in nn_config['inputs']: f.write(f' // Sending fixed value 1 for {layer["name"]}\n') - #f.write(f' total_bits = {layer["integer_bits"] + layer["fractional_bits"]};\n') f.write(f' {layer["name"]}_tvalid = 1;\n') f.write(f' for (j = 0; j < {layer["fifo_depth"]}; j = j + 1) begin\n') for k in range(layer['batch_size']): @@ -211,7 +209,6 @@ def generate_verilog_testbench(nn_config, testbench_output_path): # Third Data Pattern: All zeros (here measure output and cycles) for layer in nn_config['inputs']: f.write(f' // Sending all zeros for {layer["name"]} (here we measure output and cycles)\n') - #f.write(f' total_bits = {layer["integer_bits"] + layer["fractional_bits"]};\n') f.write(f' {layer["name"]}_tvalid = 1;\n') f.write(f' for (j = 0; j < {layer["fifo_depth"]}; j = j + 1) begin\n') for k in range(layer['batch_size']): @@ -228,7 +225,7 @@ def generate_verilog_testbench(nn_config, testbench_output_path): f.write(' end_cycle = cycle_count;\n') f.write(' $display("Total cycles from start to done: %0d", end_cycle - start_cycle);\n') f.write(' // Write latency to JSON\n') - f.write(' 
$fwrite(csv_file, "latency_cycles,0,%d\\n", end_cycle - start_cycle);\n') + f.write(' $fwrite(csv_file, "latency_cycles,0,%0d\\n", end_cycle - start_cycle);\n') f.write(' repeat (2) @(posedge ap_clk);\n') f.write(' $fclose(csv_file);\n') f.write(' $finish;\n') @@ -262,15 +259,21 @@ def generate_verilog_testbench(nn_config, testbench_output_path): f.write('endmodule\n') -def read_testbench_log(testbench_file): - # Read the CSV and print it in a numpy-structure +def read_testbench_log(testbench_log_path): + """ + Reads the testbench log file, extracts simulation outputs + and prints each output in numpy-like format along with the latency cycles. + """ - if not os.path.exists(testbench_file): - print(f"Error: The file '{testbench_file}' does not exist.") + if not os.path.exists(testbench_log_path): + print(f"Error: The file '{testbench_log_path}' does not exist.") return - df = pd.read_csv(testbench_file) + + df = pd.read_csv(testbench_log_path) latency = df[df['output_name'] == 'latency_cycles']['value'].iloc[0] grouped = df[df['output_name'] != 'latency_cycles'].groupby('output_name') + + print('=== SIMULATION OUTPUT ===') for name, group in grouped: indices = group['index'].astype(int) values = group['value'] diff --git a/scripts/ip_stitcher.tcl b/scripts/ip_stitcher.tcl index 99cb580100..c9665b084e 100644 --- a/scripts/ip_stitcher.tcl +++ b/scripts/ip_stitcher.tcl @@ -536,7 +536,7 @@ if {$export_design} { ipx::check_integrity [ipx::find_open_core user.org:user:stitched_design:1.0] ipx::save_core [ipx::find_open_core user.org:user:stitched_design:1.0] puts "Stitched IP has been exported to '$stitched_ip_dir' folder" -} +} if {$sim_design} { puts "Adding simulation Verilog file..." @@ -549,10 +549,11 @@ if {$sim_design} { add_files -fileset sim_1 -norecurse -scan_for_includes "$base_dir/$sim_verilog_file" update_compile_order -fileset sim_1 puts "Simulation Verilog file added: $base_dir/$sim_verilog_file" - # Set the simulation top module if necessary set_property top tb_design_1_wrapper [get_filesets sim_1] - # Run the behavioral simulation set_property -name {xsim.simulate.runtime} -value {200000ns} -objects [get_filesets sim_1] + puts "##########################" + puts "# Launching simulation #" + puts "##########################" launch_simulation } else { puts "Error: Simulation Verilog file not found: $base_dir/$sim_verilog_file" From 9d693559d8deb6dc5597fb63913908bebf2780a0 Mon Sep 17 00:00:00 2001 From: dimdano Date: Wed, 11 Dec 2024 17:45:51 +0100 Subject: [PATCH 21/50] =?UTF-8?q?improvements=20in=20testbench=20generatio?= =?UTF-8?q?n=20and=20build=20interface=E2=80=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- hls4ml/backends/vitis/vitis_backend.py | 112 ++++++++++++++++-- hls4ml/model/graph.py | 34 ++++-- .../templates/vivado}/ip_stitcher.tcl | 0 hls4ml/utils/simulation_utils.py | 37 +++--- 4 files changed, 142 insertions(+), 41 deletions(-) rename {scripts => hls4ml/templates/vivado}/ip_stitcher.tcl (100%) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index be68aa1d18..52d9a0a12f 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -3,6 +3,7 @@ import subprocess import importlib.util import json +import shutil from hls4ml.backends import VivadoBackend from hls4ml.model.flow import get_flow, register_flow @@ -131,31 +132,37 @@ def build( return parse_vivado_report(output_dir) - def stitch_design(self, output_dir, project_name, 
sim_design = False, export = False, nn_config=None): - + def stitch_design(self, output_dir, project_name, sim_stitched_design=False, export_stitched_design=False, nn_config=None, build_results=None): + os.makedirs(output_dir, exist_ok=True) stitched_design_dir = os.path.join(output_dir, 'vivado_stitched_design') os.makedirs(stitched_design_dir, exist_ok=True) spec = importlib.util.find_spec("hls4ml") hls4ml_path = os.path.dirname(spec.origin) + ip_stitcher_path = os.path.join(hls4ml_path, 'templates/vivado/ip_stitcher.tcl') + + try: + shutil.copy(ip_stitcher_path, stitched_design_dir) + except Exception as e: + print(f"Error: {e}. Cannot copy 'ip_stitcher.tcl' to 'vivado_stitched_design' folder.") - nn_config_file = os.path.join(stitched_design_dir, "nn_config.json") + nn_config_path = os.path.join(stitched_design_dir, "nn_config.json") if nn_config: - with open(nn_config_file, "w") as file: + with open(nn_config_path, "w") as file: json.dump(nn_config, file, indent=4) - if(sim_design): - testbench_file_path = os.path.join(stitched_design_dir, "testbench.v") - generate_verilog_testbench(nn_config, testbench_file_path) + if(sim_stitched_design): + testbench_path = os.path.join(stitched_design_dir, "testbench.v") + generate_verilog_testbench(nn_config, testbench_path) print('Verilog testbench generated.') - print('Running build process of stitched IP...\n') + print('Running build process of stitched IP...\n') stitch_command = [ 'vivado', '-mode', 'batch', '-nojournal', '-nolog', '-notrace', - '-source', os.path.join(hls4ml_path, '../scripts/ip_stitcher.tcl'), + '-source', ip_stitcher_path, '-tclargs', - f'sim_design={int(sim_design)}', - f'export_design={int(export)}', + f'sim_design={int(sim_stitched_design)}', + f'export_design={int(export_stitched_design)}', f'sim_verilog_file=vivado_stitched_design/testbench.v' ] @@ -175,6 +182,85 @@ def stitch_design(self, output_dir, project_name, sim_design = False, export = F if process.returncode != 0: raise Exception(f'Stitching failed for {project_name}. See logs for details.') - if(sim_design): + stitched_report = self._aggregate_build_results(build_results) + + if(sim_stitched_design): testbench_log_path = os.path.join(stitched_design_dir, 'vivado_stitched_design.sim/sim_1/behav/xsim/testbench_log.csv') - read_testbench_log(testbench_log_path) \ No newline at end of file + sim_data = read_testbench_log(testbench_log_path) + csim_results = [] + for name, arr in sim_data['outputs'].items(): + # Convert floats to strings + arr_str = [f"{val:.6f}" for val in arr] + csim_results.append(arr_str) + + # Add simulation data to stitched report + stitched_report['CSimResults'] = csim_results + stitched_report['CSynthesisReport']['Stiched_Design_Latency'] = sim_data['latency_cycles'] + + return stitched_report + + def _aggregate_build_results(self, build_results): + """ + Aggregate the resources of each subgraph into a single dictionary. + For resources like BRAM_18K, DSP, FF, LUT, URAM we sum them. + For timing/latency we picked we sum them. + Here we: + - Take TargetClockPeriod from the first subgraph. + - Take the maximum EstimatedClockPeriod among subgraphs. + - Take maximum BestLatency, WorstLatency, IntervalMin, IntervalMax among subgraphs. + - Sum the resource fields. 
+ """ + + if build_results is None or len(build_results) == 0: + return {} + + keys_to_sum = ['BRAM_18K', 'DSP', 'FF', 'LUT', 'URAM', 'WorstLatency'] + # Non-resource fields we might want to handle + # We'll initialize them from the first subgraph + first_subgraph = next(iter(build_results)) + base_report = build_results[first_subgraph]['CSynthesisReport'] + + final_report = { + 'TargetClockPeriod': base_report.get('TargetClockPeriod', '5.00'), + 'EstimatedClockPeriod': float(base_report.get('EstimatedClockPeriod', '5.00')), + 'WorstLatency': int(base_report.get('WorstLatency', '0')), + } + + # Initialize resources + for k in keys_to_sum: + final_report[k] = int(base_report.get(k, '0')) + + # Also include availability fields from the first subgraph + # TODO match actual device resources + final_report['AvailableBRAM_18K'] = base_report.get('AvailableBRAM_18K', '5376') + final_report['AvailableDSP'] = base_report.get('AvailableDSP', '12288') + final_report['AvailableFF'] = base_report.get('AvailableFF', '3456000') + final_report['AvailableLUT'] = base_report.get('AvailableLUT', '1728000') + final_report['AvailableURAM'] = base_report.get('AvailableURAM', '1280') + + # Aggregate from other subgraphs + for subgraph, data in build_results.items(): + if subgraph == first_subgraph: + continue + report = data.get('CSynthesisReport', {}) + # Update non-resource fields + est_cp = float(report.get('EstimatedClockPeriod', '5.00')) + if est_cp > final_report['EstimatedClockPeriod']: + final_report['EstimatedClockPeriod'] = est_cp + + # Take max of these latency fields + final_report['WorstLatency'] = max(final_report['WorstLatency'], int(report.get('WorstLatency', '0'))) + + # Sum resource fields + for k in keys_to_sum: + final_report[k] += int(report.get(k, '0')) + + # Convert numbers back to strings + final_report['EstimatedClockPeriod'] = f"{final_report['EstimatedClockPeriod']:.3f}" + final_report['WorstLatency'] = str(final_report['WorstLatency']) + + for k in keys_to_sum: + final_report[k] = str(final_report[k]) + + # Return in the desired structure + return {'CSynthesisReport': final_report} diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index f3fcb7bfc9..92b73e2252 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -1018,6 +1018,7 @@ def __init__(self, graphs): self.project_name = re.sub(r'_graph\d+$', '_stitched', graphs[0].config.get_project_name()) self.output_dir = graphs[0].config.get_output_dir().split('/')[0] self.backend = self.graphs[0].config.backend + self.build_results = None def __getitem__(self, index): return self.graphs[index] @@ -1057,8 +1058,16 @@ def parse_nn_config(self): return nn_config - def build(self, max_workers=None, **kwargs): - # Build all ModelGraph instances in parallel. + def build(self, stitch_design=False, sim_stitched_design=False, export_stitched_design=False, max_workers=None, **kwargs): + """ + Builds all ModelGraph instances in parallel, with optional stitching and export. 
+ """ + + export = kwargs.get('export', False) + if (stitch_design or sim_stitched_design or export_stitched_design) and not export: + raise ValueError("You can't enable stitch_design, sim_stitched_design, or export_stitched_design without having export=True.") + if (sim_stitched_design or export_stitched_design) and not stitch_design: + raise ValueError("You can't simulate or export a stitched design without enabling stitch_design.") build_results = {} total_builds = len(self.graphs) status = {} @@ -1096,6 +1105,17 @@ def build_wrapper(g, **kwargs): build_results[project_name] = result except Exception as exc: build_results[project_name] = None + + if stitch_design: + nn_config = self.parse_nn_config() + build_results = self.backend.stitch_design( + output_dir=self.output_dir, + project_name=self.project_name, + sim_stitched_design=sim_stitched_design, + export_stitched_design=export_stitched_design, + nn_config=nn_config, + build_results=build_results) + return build_results def compile(self): @@ -1118,16 +1138,6 @@ def trace(self, x): input_data = output_data trace_output.append(curr_trace_output) return output_data, trace_output - - def stitch_design(self, sim_design = False, export = False, **kwargs): - nn_config = self.parse_nn_config() - self.backend.stitch_design( - output_dir=self.output_dir, - project_name=self.project_name, - sim_design=sim_design, - export=export, - nn_config=nn_config, - **kwargs) def _print_status(self, status): print('\r', end='') diff --git a/scripts/ip_stitcher.tcl b/hls4ml/templates/vivado/ip_stitcher.tcl similarity index 100% rename from scripts/ip_stitcher.tcl rename to hls4ml/templates/vivado/ip_stitcher.tcl diff --git a/hls4ml/utils/simulation_utils.py b/hls4ml/utils/simulation_utils.py index df15f1a69f..ce2dede7d0 100644 --- a/hls4ml/utils/simulation_utils.py +++ b/hls4ml/utils/simulation_utils.py @@ -261,23 +261,28 @@ def generate_verilog_testbench(nn_config, testbench_output_path): def read_testbench_log(testbench_log_path): """ - Reads the testbench log file, extracts simulation outputs - and prints each output in numpy-like format along with the latency cycles. + Reads the testbench log file and returns a dictionary with latency and + output arrays for each output_name. 
""" - if not os.path.exists(testbench_log_path): print(f"Error: The file '{testbench_log_path}' does not exist.") - return - - df = pd.read_csv(testbench_log_path) - latency = df[df['output_name'] == 'latency_cycles']['value'].iloc[0] - grouped = df[df['output_name'] != 'latency_cycles'].groupby('output_name') - - print('=== SIMULATION OUTPUT ===') - for name, group in grouped: - indices = group['index'].astype(int) - values = group['value'] - array = np.zeros(max(indices) + 1) + return {} + + df = pd.read_csv(testbench_log_path) + latency = df[df['output_name'] == 'latency_cycles']['value'].iloc[0] + grouped = df[df['output_name'] != 'latency_cycles'].groupby('output_name') + + sim_dict = { + 'latency_cycles': int(latency), + 'outputs': {} + } + + for name, group in grouped: + indices = group['index'].astype(int) + values = group['value'].astype(float) + array = np.zeros(max(indices) + 1, dtype=float) array[indices] = values - print(f"{name}:\n{array}\n") - print(f"Latency (cycles): {int(latency)}") \ No newline at end of file + sim_dict['outputs'][name] = array + + return sim_dict + From d1dd0fd0b76abf6ed0f7020888c5f4e944018bcf Mon Sep 17 00:00:00 2001 From: dimdano Date: Thu, 12 Dec 2024 17:44:14 +0100 Subject: [PATCH 22/50] general improvements --- hls4ml/backends/vitis/vitis_backend.py | 108 +++++------------------- hls4ml/model/graph.py | 16 ++-- hls4ml/report/__init__.py | 1 + hls4ml/report/vivado_report.py | 50 ++++++++++- hls4ml/templates/vivado/ip_stitcher.tcl | 15 ++-- hls4ml/utils/simulation_utils.py | 4 +- 6 files changed, 90 insertions(+), 104 deletions(-) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index 52d9a0a12f..a1fe71e3a0 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -7,7 +7,7 @@ from hls4ml.backends import VivadoBackend from hls4ml.model.flow import get_flow, register_flow -from hls4ml.report import parse_vivado_report +from hls4ml.report import parse_vivado_report, aggregate_graph_reports from hls4ml.utils.simulation_utils import generate_verilog_testbench, read_testbench_log @@ -132,27 +132,30 @@ def build( return parse_vivado_report(output_dir) - def stitch_design(self, output_dir, project_name, sim_stitched_design=False, export_stitched_design=False, nn_config=None, build_results=None): + def stitch_design(self, output_dir, project_name, sim_stitched_design=False, export_stitched_design=False, nn_config=None, graph_reports=None): os.makedirs(output_dir, exist_ok=True) - stitched_design_dir = os.path.join(output_dir, 'vivado_stitched_design') - os.makedirs(stitched_design_dir, exist_ok=True) + stitched_design_dir = os.path.join(output_dir, project_name) + if os.path.exists(stitched_design_dir): + raise FileExistsError(f"The directory '{stitched_design_dir}' already exists.") + os.makedirs(stitched_design_dir) + spec = importlib.util.find_spec("hls4ml") hls4ml_path = os.path.dirname(spec.origin) ip_stitcher_path = os.path.join(hls4ml_path, 'templates/vivado/ip_stitcher.tcl') - + nn_config_path = os.path.join(stitched_design_dir, "nn_config.json") + testbench_path = os.path.join(stitched_design_dir, "testbench.v") + try: shutil.copy(ip_stitcher_path, stitched_design_dir) except Exception as e: - print(f"Error: {e}. Cannot copy 'ip_stitcher.tcl' to 'vivado_stitched_design' folder.") + print(f"Error: {e}. 
Cannot copy 'ip_stitcher.tcl' to {project_name} folder.") - nn_config_path = os.path.join(stitched_design_dir, "nn_config.json") if nn_config: with open(nn_config_path, "w") as file: json.dump(nn_config, file, indent=4) if(sim_stitched_design): - testbench_path = os.path.join(stitched_design_dir, "testbench.v") generate_verilog_testbench(nn_config, testbench_path) print('Verilog testbench generated.') @@ -163,7 +166,8 @@ def stitch_design(self, output_dir, project_name, sim_stitched_design=False, exp '-tclargs', f'sim_design={int(sim_stitched_design)}', f'export_design={int(export_stitched_design)}', - f'sim_verilog_file=vivado_stitched_design/testbench.v' + f'stitch_project_name={project_name}', + f'sim_verilog_file={os.path.join(project_name, "testbench.v")}' ] stdout_log = os.path.join(stitched_design_dir, 'stitcher_stdout.log') @@ -182,85 +186,17 @@ def stitch_design(self, output_dir, project_name, sim_stitched_design=False, exp if process.returncode != 0: raise Exception(f'Stitching failed for {project_name}. See logs for details.') - stitched_report = self._aggregate_build_results(build_results) + stitched_report = aggregate_graph_reports(graph_reports) if(sim_stitched_design): - testbench_log_path = os.path.join(stitched_design_dir, 'vivado_stitched_design.sim/sim_1/behav/xsim/testbench_log.csv') - sim_data = read_testbench_log(testbench_log_path) - csim_results = [] - for name, arr in sim_data['outputs'].items(): - # Convert floats to strings - arr_str = [f"{val:.6f}" for val in arr] - csim_results.append(arr_str) + testbench_log_path = os.path.join(stitched_design_dir, project_name + '.sim/sim_1/behav/xsim/testbench_log.csv') + testbench_output = read_testbench_log(testbench_log_path) - # Add simulation data to stitched report - stitched_report['CSimResults'] = csim_results - stitched_report['CSynthesisReport']['Stiched_Design_Latency'] = sim_data['latency_cycles'] + behavioral_sim_results = [] + for name, arr in testbench_output['outputs'].items(): + arr_str = [f"{val:.6f}" for val in arr] + behavioral_sim_results.append(arr_str) + stitched_report['BehavSimResults'] = behavioral_sim_results + stitched_report['StitchedDesignReport']['BestLatency'] = testbench_output['latency_cycles'] return stitched_report - - def _aggregate_build_results(self, build_results): - """ - Aggregate the resources of each subgraph into a single dictionary. - For resources like BRAM_18K, DSP, FF, LUT, URAM we sum them. - For timing/latency we picked we sum them. - Here we: - - Take TargetClockPeriod from the first subgraph. - - Take the maximum EstimatedClockPeriod among subgraphs. - - Take maximum BestLatency, WorstLatency, IntervalMin, IntervalMax among subgraphs. - - Sum the resource fields. 
- """ - - if build_results is None or len(build_results) == 0: - return {} - - keys_to_sum = ['BRAM_18K', 'DSP', 'FF', 'LUT', 'URAM', 'WorstLatency'] - # Non-resource fields we might want to handle - # We'll initialize them from the first subgraph - first_subgraph = next(iter(build_results)) - base_report = build_results[first_subgraph]['CSynthesisReport'] - - final_report = { - 'TargetClockPeriod': base_report.get('TargetClockPeriod', '5.00'), - 'EstimatedClockPeriod': float(base_report.get('EstimatedClockPeriod', '5.00')), - 'WorstLatency': int(base_report.get('WorstLatency', '0')), - } - - # Initialize resources - for k in keys_to_sum: - final_report[k] = int(base_report.get(k, '0')) - - # Also include availability fields from the first subgraph - # TODO match actual device resources - final_report['AvailableBRAM_18K'] = base_report.get('AvailableBRAM_18K', '5376') - final_report['AvailableDSP'] = base_report.get('AvailableDSP', '12288') - final_report['AvailableFF'] = base_report.get('AvailableFF', '3456000') - final_report['AvailableLUT'] = base_report.get('AvailableLUT', '1728000') - final_report['AvailableURAM'] = base_report.get('AvailableURAM', '1280') - - # Aggregate from other subgraphs - for subgraph, data in build_results.items(): - if subgraph == first_subgraph: - continue - report = data.get('CSynthesisReport', {}) - # Update non-resource fields - est_cp = float(report.get('EstimatedClockPeriod', '5.00')) - if est_cp > final_report['EstimatedClockPeriod']: - final_report['EstimatedClockPeriod'] = est_cp - - # Take max of these latency fields - final_report['WorstLatency'] = max(final_report['WorstLatency'], int(report.get('WorstLatency', '0'))) - - # Sum resource fields - for k in keys_to_sum: - final_report[k] += int(report.get(k, '0')) - - # Convert numbers back to strings - final_report['EstimatedClockPeriod'] = f"{final_report['EstimatedClockPeriod']:.3f}" - final_report['WorstLatency'] = str(final_report['WorstLatency']) - - for k in keys_to_sum: - final_report[k] = str(final_report[k]) - - # Return in the desired structure - return {'CSynthesisReport': final_report} diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 92b73e2252..6899890526 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -1015,10 +1015,10 @@ def make_multi_graph(cls, config, layer_list, output_shapes, split_layer_names): class MultiModelGraph: def __init__(self, graphs): self.graphs = graphs - self.project_name = re.sub(r'_graph\d+$', '_stitched', graphs[0].config.get_project_name()) + self.project_name = 'vivado_stitched_design' self.output_dir = graphs[0].config.get_output_dir().split('/')[0] self.backend = self.graphs[0].config.backend - self.build_results = None + self.graph_reports = None def __getitem__(self, index): return self.graphs[index] @@ -1055,8 +1055,7 @@ def parse_nn_config(self): "batch_size": int(batch_size) }) - return nn_config - + return nn_config def build(self, stitch_design=False, sim_stitched_design=False, export_stitched_design=False, max_workers=None, **kwargs): """ @@ -1106,17 +1105,20 @@ def build_wrapper(g, **kwargs): except Exception as exc: build_results[project_name] = None + self.graph_reports=build_results + if stitch_design: nn_config = self.parse_nn_config() - build_results = self.backend.stitch_design( + stitched_report = self.backend.stitch_design( output_dir=self.output_dir, project_name=self.project_name, sim_stitched_design=sim_stitched_design, export_stitched_design=export_stitched_design, nn_config=nn_config, - 
build_results=build_results) + graph_reports=self.graph_reports) + return stitched_report - return build_results + return self.graph_reports def compile(self): for g in self.graphs: diff --git a/hls4ml/report/__init__.py b/hls4ml/report/__init__.py index 3c9b7707b7..88c21a2289 100644 --- a/hls4ml/report/__init__.py +++ b/hls4ml/report/__init__.py @@ -6,3 +6,4 @@ from hls4ml.report.vivado_report import parse_vivado_report # noqa: F401 from hls4ml.report.vivado_report import print_vivado_report # noqa: F401 from hls4ml.report.vivado_report import read_vivado_report # noqa: F401 +from hls4ml.report.vivado_report import aggregate_graph_reports diff --git a/hls4ml/report/vivado_report.py b/hls4ml/report/vivado_report.py index f854dc60b8..2d322adad7 100644 --- a/hls4ml/report/vivado_report.py +++ b/hls4ml/report/vivado_report.py @@ -671,4 +671,52 @@ def _make_report_body(report_dict, make_table_template, make_header_template): body = body.format(**params) - return body \ No newline at end of file + return body + + +def aggregate_graph_reports(graph_reports): + """ + Aggregate the build results of each subgraph into a single dictionary. + """ + + if graph_reports is None or len(graph_reports) == 0: + return {} + + keys_to_sum = ['BRAM_18K', 'DSP', 'FF', 'LUT', 'URAM', 'WorstLatency'] + first_subgraph = next(iter(graph_reports)) + base_report = graph_reports[first_subgraph]['CSynthesisReport'] + + final_report = { + 'TargetClockPeriod': base_report.get('TargetClockPeriod', 'N/A'), + 'EstimatedClockPeriod': float(base_report.get('EstimatedClockPeriod', float('inf'))), + 'WorstLatency': int(base_report.get('WorstLatency', '-1')), + } + + for k in keys_to_sum: + final_report[k] = int(base_report.get(k, '0')) + + final_report['AvailableBRAM_18K'] = base_report.get('AvailableBRAM_18K', 'N/A') + final_report['AvailableDSP'] = base_report.get('AvailableDSP', 'N/A') + final_report['AvailableFF'] = base_report.get('AvailableFF', 'N/A') + final_report['AvailableLUT'] = base_report.get('AvailableLUT', 'N/A') + final_report['AvailableURAM'] = base_report.get('AvailableURAM', 'N/A') + + for subgraph, data in graph_reports.items(): + if subgraph == first_subgraph: + continue + report = data.get('CSynthesisReport', {}) + est_cp = float(report.get('EstimatedClockPeriod', float('inf'))) + if est_cp > final_report['EstimatedClockPeriod']: + final_report['EstimatedClockPeriod'] = est_cp + + final_report['WorstLatency'] = max(final_report['WorstLatency'], int(report.get('WorstLatency', '-1'))) + + for k in keys_to_sum: + final_report[k] += int(report.get(k, '0')) + + final_report['EstimatedClockPeriod'] = f"{final_report['EstimatedClockPeriod']:.3f}" + final_report['WorstLatency'] = str(final_report['WorstLatency']) + for k in keys_to_sum: + final_report[k] = str(final_report[k]) + + return {'StitchedDesignReport': final_report} diff --git a/hls4ml/templates/vivado/ip_stitcher.tcl b/hls4ml/templates/vivado/ip_stitcher.tcl index c9665b084e..785489243b 100644 --- a/hls4ml/templates/vivado/ip_stitcher.tcl +++ b/hls4ml/templates/vivado/ip_stitcher.tcl @@ -10,9 +10,10 @@ puts "###########################################################" array set opt { - sim_design 0 - export_design 0 - sim_verilog_file "" + sim_design 0 + export_design 0 + stitch_project_name "" + sim_verilog_file "" } foreach arg $::argv { @@ -30,6 +31,7 @@ foreach arg $::argv { set sim_design [expr {$opt(sim_design)}] set export_design [expr {$opt(export_design)}] set sim_verilog_file $opt(sim_verilog_file) +set stitch_project_name 
$opt(stitch_project_name) # Project base dir set base_dir [pwd] @@ -61,10 +63,9 @@ puts "###########################################################" # Create New Vivado Project -set project_name "vivado_stitched_design" -file mkdir $project_name -cd $project_name -create_project $project_name . -part $part +file mkdir $stitch_project_name +cd $stitch_project_name +create_project $stitch_project_name . -part $part # Add repositories # Initialize the repo count diff --git a/hls4ml/utils/simulation_utils.py b/hls4ml/utils/simulation_utils.py index ce2dede7d0..fa93843268 100644 --- a/hls4ml/utils/simulation_utils.py +++ b/hls4ml/utils/simulation_utils.py @@ -261,8 +261,7 @@ def generate_verilog_testbench(nn_config, testbench_output_path): def read_testbench_log(testbench_log_path): """ - Reads the testbench log file and returns a dictionary with latency and - output arrays for each output_name. + Reads the testbench log file and returns a dictionary """ if not os.path.exists(testbench_log_path): print(f"Error: The file '{testbench_log_path}' does not exist.") @@ -285,4 +284,3 @@ def read_testbench_log(testbench_log_path): sim_dict['outputs'][name] = array return sim_dict - From 0bb10df0896ec2d85386d41b25927c5b66b91719 Mon Sep 17 00:00:00 2001 From: dimdano Date: Tue, 17 Dec 2024 17:44:14 +0100 Subject: [PATCH 23/50] only simulate stitched_design, better verilog testbench --- hls4ml/backends/vitis/vitis_backend.py | 20 +- hls4ml/model/graph.py | 29 +- hls4ml/report/vivado_report.py | 8 +- hls4ml/templates/vivado/ip_stitcher.tcl | 824 ++++++++++++------------ hls4ml/utils/simulation_utils.py | 332 ++++++---- 5 files changed, 682 insertions(+), 531 deletions(-) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index a1fe71e3a0..9eb157079e 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -132,13 +132,14 @@ def build( return parse_vivado_report(output_dir) - def stitch_design(self, output_dir, project_name, sim_stitched_design=False, export_stitched_design=False, nn_config=None, graph_reports=None): + def build_stitched_design(self, output_dir, project_name, stitch_design=True, sim_stitched_design=False, export_stitched_design=False, nn_config=None, graph_reports=None): os.makedirs(output_dir, exist_ok=True) stitched_design_dir = os.path.join(output_dir, project_name) - if os.path.exists(stitched_design_dir): - raise FileExistsError(f"The directory '{stitched_design_dir}' already exists.") - os.makedirs(stitched_design_dir) + if stitch_design: + if os.path.exists(stitched_design_dir): + raise FileExistsError(f"The directory '{stitched_design_dir}' already exists.") + os.makedirs(stitched_design_dir) spec = importlib.util.find_spec("hls4ml") hls4ml_path = os.path.dirname(spec.origin) @@ -164,6 +165,7 @@ def stitch_design(self, output_dir, project_name, sim_stitched_design=False, exp 'vivado', '-mode', 'batch', '-nojournal', '-nolog', '-notrace', '-source', ip_stitcher_path, '-tclargs', + f'stitch_design={int(stitch_design)}', f'sim_design={int(sim_stitched_design)}', f'export_design={int(export_stitched_design)}', f'stitch_project_name={project_name}', @@ -186,9 +188,11 @@ def stitch_design(self, output_dir, project_name, sim_stitched_design=False, exp if process.returncode != 0: raise Exception(f'Stitching failed for {project_name}. 
See logs for details.')

-        stitched_report = aggregate_graph_reports(graph_reports)
+        stitched_report = {}
+        if stitch_design:
+            stitched_report = aggregate_graph_reports(graph_reports)

-        if(sim_stitched_design):
+        if sim_stitched_design:
             testbench_log_path = os.path.join(stitched_design_dir, project_name + '.sim/sim_1/behav/xsim/testbench_log.csv')
             testbench_output = read_testbench_log(testbench_log_path)

@@ -197,6 +201,8 @@ def stitch_design(self, output_dir, project_name, sim_stitched_design=False, exp
                     arr_str = [f"{val:.6f}" for val in arr]
                     behavioral_sim_results.append(arr_str)
                 stitched_report['BehavSimResults'] = behavioral_sim_results
-            stitched_report['StitchedDesignReport']['BestLatency'] = testbench_output['latency_cycles']
+            if stitch_design:
+                stitched_report['StitchedDesignReport']['BestLatency'] = testbench_output['BestLatency']
+                stitched_report['StitchedDesignReport']['WorstLatency'] = testbench_output['WorstLatency']

         return stitched_report

diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py
index 6899890526..8544d60a4c 100644
--- a/hls4ml/model/graph.py
+++ b/hls4ml/model/graph.py
@@ -1107,11 +1107,12 @@ def build_wrapper(g, **kwargs):

         self.graph_reports=build_results

-        if stitch_design:
+        if stitch_design or sim_stitched_design or export_stitched_design:
             nn_config = self.parse_nn_config()
-            stitched_report = self.backend.stitch_design(
+            stitched_report = self.backend.build_stitched_design(
                 output_dir=self.output_dir,
                 project_name=self.project_name,
+                stitch_design=stitch_design,
                 sim_stitched_design=sim_stitched_design,
                 export_stitched_design=export_stitched_design,
                 nn_config=nn_config,
@@ -1124,12 +1125,24 @@ def compile(self):
         for g in self.graphs:
             g.compile()

-    def predict(self, x):
-        input_data = x
-        for g in self.graphs:
-            output_data = g.predict(input_data)
-            input_data = output_data
-        return output_data
+    def predict(self, x, sim_stitched_design=False):
+        if not sim_stitched_design:
+            input_data = x
+            for g in self.graphs:
+                output_data = g.predict(input_data)
+                input_data = output_data
+            return output_data
+        else:
+            nn_config = self.parse_nn_config()
+            stitched_report = self.backend.build_stitched_design(
+                output_dir=self.output_dir,
+                project_name=self.project_name,
+                stitch_design=False,
+                sim_stitched_design=True,
+                export_stitched_design=False,
+                nn_config=nn_config,
+                graph_reports=self.graph_reports)
+            return stitched_report

     def trace(self, x):
         # TODO: finish trace function
diff --git a/hls4ml/report/vivado_report.py b/hls4ml/report/vivado_report.py
index 2d322adad7..43e783f753 100644
--- a/hls4ml/report/vivado_report.py
+++ b/hls4ml/report/vivado_report.py
@@ -682,14 +682,15 @@ def aggregate_graph_reports(graph_reports):
     if graph_reports is None or len(graph_reports) == 0:
         return {}

-    keys_to_sum = ['BRAM_18K', 'DSP', 'FF', 'LUT', 'URAM', 'WorstLatency']
+    keys_to_sum = ['BRAM_18K', 'DSP', 'FF', 'LUT', 'URAM']
     first_subgraph = next(iter(graph_reports))
     base_report = graph_reports[first_subgraph]['CSynthesisReport']

     final_report = {
         'TargetClockPeriod': base_report.get('TargetClockPeriod', 'N/A'),
         'EstimatedClockPeriod': float(base_report.get('EstimatedClockPeriod', float('inf'))),
-        'WorstLatency': int(base_report.get('WorstLatency', '-1')),
+        'BestLatency': 'N/A',
+        'WorstLatency': 'N/A'
     }

     for k in keys_to_sum:
@@ -709,13 +710,10 @@ def aggregate_graph_reports(graph_reports):
         if est_cp > final_report['EstimatedClockPeriod']:
             final_report['EstimatedClockPeriod'] = est_cp

-        final_report['WorstLatency'] = max(final_report['WorstLatency'], 
int(report.get('WorstLatency', '-1'))) - for k in keys_to_sum: final_report[k] += int(report.get(k, '0')) final_report['EstimatedClockPeriod'] = f"{final_report['EstimatedClockPeriod']:.3f}" - final_report['WorstLatency'] = str(final_report['WorstLatency']) for k in keys_to_sum: final_report[k] = str(final_report[k]) diff --git a/hls4ml/templates/vivado/ip_stitcher.tcl b/hls4ml/templates/vivado/ip_stitcher.tcl index 785489243b..6612064a6d 100644 --- a/hls4ml/templates/vivado/ip_stitcher.tcl +++ b/hls4ml/templates/vivado/ip_stitcher.tcl @@ -10,6 +10,7 @@ puts "###########################################################" array set opt { + stitch_design 1 sim_design 0 export_design 0 stitch_project_name "" @@ -28,6 +29,7 @@ foreach arg $::argv { } } +set stitch_design [expr {$opt(stitch_design)}] set sim_design [expr {$opt(sim_design)}] set export_design [expr {$opt(export_design)}] set sim_verilog_file $opt(sim_verilog_file) @@ -35,6 +37,8 @@ set stitch_project_name $opt(stitch_project_name) # Project base dir set base_dir [pwd] +# Name of the block design +set bd_name "stitched_design" # Find a directory that ends with "graph1", "graph2", etc. set project_dirs [glob -nocomplain -directory $base_dir *graph[0-9]] @@ -57,472 +61,488 @@ if {[llength $project_dirs] == 0} { } } -puts "###########################################################" -puts "# Starting the IP connection process... " -puts "###########################################################" - - -# Create New Vivado Project -file mkdir $stitch_project_name -cd $stitch_project_name -create_project $stitch_project_name . -part $part - -# Add repositories -# Initialize the repo count -set repo_count 0 -# Loop through potential project directories -for {set i 1} {[file exists "$base_dir/hls4ml_prj_graph$i/myproject_graph${i}_prj"]} {incr i} { - set repo_path "$base_dir/hls4ml_prj_graph$i/myproject_graph${i}_prj/solution1/impl/ip" - # Check if the repository path exists - if {[file isdirectory $repo_path]} { - # Add repository path to current project's IP repository paths - set_property ip_repo_paths [concat [get_property ip_repo_paths [current_project]] $repo_path] [current_project] - - # Increment the repo count - incr repo_count +# Procedure for stitching the project +proc stitch_procedure {base_dir stitch_project_name bd_name part} { + + puts "###########################################################" + puts "# Starting the IP connection process... " + puts "###########################################################" + + + # Create New Vivado Project + file mkdir $stitch_project_name + cd $stitch_project_name + create_project $stitch_project_name . -part $part + + # Add repositories + # Initialize the repo count + set repo_count 0 + # Loop through potential project directories + for {set i 1} {[file exists "$base_dir/hls4ml_prj_graph$i/myproject_graph${i}_prj"]} {incr i} { + set repo_path "$base_dir/hls4ml_prj_graph$i/myproject_graph${i}_prj/solution1/impl/ip" + # Check if the repository path exists + if {[file isdirectory $repo_path]} { + # Add repository path to current project's IP repository paths + set_property ip_repo_paths [concat [get_property ip_repo_paths [current_project]] $repo_path] [current_project] + + # Increment the repo count + incr repo_count + + puts "Added IP repository path: $repo_path" + } else { + puts "Directory does not exist: $repo_path" + } + } - puts "Added IP repository path: $repo_path" + if { $repo_count == 0 } { + puts "No IP repositories were found in the specified directories." 
} else { - puts "Directory does not exist: $repo_path" + puts "Total IP repositories added: $repo_count" } -} + # Rescan repositories + update_ip_catalog -if { $repo_count == 0 } { - puts "No IP repositories were found in the specified directories." -} else { - puts "Total IP repositories added: $repo_count" -} -# Rescan repositories -update_ip_catalog + create_bd_design $bd_name -# Name of the block design -set bd_name "stitched_design" -create_bd_design $bd_name + # Add IPs to block design + for {set i 1} {$i <= $repo_count} {incr i} { + set vlnv "xilinx.com:hls:myproject_graph$i:1.0" + create_bd_cell -type ip -vlnv $vlnv "myproject_graph${i}_0" + } -# Add IPs to block design -for {set i 1} {$i <= $repo_count} {incr i} { - set vlnv "xilinx.com:hls:myproject_graph$i:1.0" - create_bd_cell -type ip -vlnv $vlnv "myproject_graph${i}_0" -} + # Collect all IP instance names in a list + set ip_instances {} + for {set i 1} {$i <= $repo_count} {incr i} { + set ip_name "myproject_graph${i}_0" + lappend ip_instances $ip_name + } -# Collect all IP instance names in a list -set ip_instances {} -for {set i 1} {$i <= $repo_count} {incr i} { - set ip_name "myproject_graph${i}_0" - lappend ip_instances $ip_name -} + # Collect 'ap_clk' and 'ap_rst' signals from all IPs + set ap_clk_ports {} + set ap_rst_ports {} -# Collect 'ap_clk' and 'ap_rst' signals from all IPs -set ap_clk_ports {} -set ap_rst_ports {} - -foreach ip $ip_instances { - set ip_cell [get_bd_cells $ip] - set ip_pins [get_bd_pins -of $ip_cell] - foreach pin $ip_pins { - set pin_name [get_property NAME $pin] - if {[string match "ap_clk*" $pin_name]} { - lappend ap_clk_ports $pin - } elseif {[string match "ap_rst*" $pin_name]} { - lappend ap_rst_ports $pin + foreach ip $ip_instances { + set ip_cell [get_bd_cells $ip] + set ip_pins [get_bd_pins -of $ip_cell] + foreach pin $ip_pins { + set pin_name [get_property NAME $pin] + if {[string match "ap_clk*" $pin_name]} { + lappend ap_clk_ports $pin + } elseif {[string match "ap_rst*" $pin_name]} { + lappend ap_rst_ports $pin + } } } -} -# Create external ports for 'ap_clk' and 'ap_rst' -# ap_clk -if {[llength $ap_clk_ports] > 0} { - create_bd_port -dir I -type clk -freq_hz 100000000 ap_clk - set ap_clk_port [get_bd_ports ap_clk] - # Connect all 'ap_clk' pins to the 'ap_clk' port - foreach clk_pin $ap_clk_ports { - connect_bd_net $ap_clk_port $clk_pin + # Create external ports for 'ap_clk' and 'ap_rst' + # ap_clk + if {[llength $ap_clk_ports] > 0} { + create_bd_port -dir I -type clk -freq_hz 100000000 ap_clk + set ap_clk_port [get_bd_ports ap_clk] + # Connect all 'ap_clk' pins to the 'ap_clk' port + foreach clk_pin $ap_clk_ports { + connect_bd_net $ap_clk_port $clk_pin + } } -} -# ap_rst -if {[llength $ap_rst_ports] > 0} { - # Get the CONFIG.POLARITY property from one of the IP's 'ap_rst' pins - set sample_rst_pin [lindex $ap_rst_ports 0] - set rst_polarity [get_property CONFIG.POLARITY $sample_rst_pin] - # Create the 'ap_rst' port - set rst_port_name "ap_rst" - create_bd_port -dir I -type rst $rst_port_name - set ap_rst_port [get_bd_ports ap_rst] - - # Set the CONFIG.POLARITY property of the 'ap_rst' port based on the retrieved polarity - if {$rst_polarity ne ""} { - set_property CONFIG.POLARITY $rst_polarity $ap_rst_port - # naming convention for active-low signals - set rst_port_name "ap_rst_n" - set_property NAME $rst_port_name $ap_rst_port - } else { - # Fallback to ACTIVE_HIGH if the retrieved polarity is not defined - set_property CONFIG.POLARITY ACTIVE_HIGH $ap_rst_port - } - # Connect all 
'ap_rst' pins to the 'ap_rst' port - foreach rst_pin $ap_rst_ports { - connect_bd_net $ap_rst_port $rst_pin + # ap_rst + if {[llength $ap_rst_ports] > 0} { + # Get the CONFIG.POLARITY property from one of the IP's 'ap_rst' pins + set sample_rst_pin [lindex $ap_rst_ports 0] + set rst_polarity [get_property CONFIG.POLARITY $sample_rst_pin] + # Create the 'ap_rst' port + set rst_port_name "ap_rst" + create_bd_port -dir I -type rst $rst_port_name + set ap_rst_port [get_bd_ports ap_rst] + + # Set the CONFIG.POLARITY property of the 'ap_rst' port based on the retrieved polarity + if {$rst_polarity ne ""} { + set_property CONFIG.POLARITY $rst_polarity $ap_rst_port + # naming convention for active-low signals + set rst_port_name "ap_rst_n" + set_property NAME $rst_port_name $ap_rst_port + } else { + # Fallback to ACTIVE_HIGH if the retrieved polarity is not defined + set_property CONFIG.POLARITY ACTIVE_HIGH $ap_rst_port + } + # Connect all 'ap_rst' pins to the 'ap_rst' port + foreach rst_pin $ap_rst_ports { + connect_bd_net $ap_rst_port $rst_pin + } } -} -# Determine interface type -set first_ip [lindex $ip_instances 0] -set first_ip_cell [get_bd_cells $first_ip] -set first_ip_pins [get_bd_pins -of $first_ip_cell] - -set interface_type "unknown" -foreach port $first_ip_pins { - set port_name [get_property NAME $port] - if {[string match "*_TDATA" $port_name]} { - set interface_type "axi_stream" - break - } elseif {[regexp {^layer(?:\d+_)?out_(\d+)$} $port_name]} { - set interface_type "unpacked" - break + # Determine interface type + set first_ip [lindex $ip_instances 0] + set first_ip_cell [get_bd_cells $first_ip] + set first_ip_pins [get_bd_pins -of $first_ip_cell] + + set interface_type "unknown" + foreach port $first_ip_pins { + set port_name [get_property NAME $port] + if {[string match "*_TDATA" $port_name]} { + set interface_type "axi_stream" + break + } elseif {[regexp {^layer(?:\d+_)?out_(\d+)$} $port_name]} { + set interface_type "unpacked" + break + } } -} -if {$interface_type == "unknown"} { - puts "Error: Could not determine interface type." - exit 1 -} else { - puts "Interface type detected: $interface_type" -} + if {$interface_type == "unknown"} { + puts "Error: Could not determine interface type." 
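+        # Detection above keys off the first IP's pin names alone: "*_TDATA"
+        # implies an AXI4-Stream interface and "layer<n>_out_<i>" an unpacked
+        # ap_vld-style one, so IPs matching neither naming scheme end up here.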
+ exit 1 + } else { + puts "Interface type detected: $interface_type" + } -# Collect 'ap_start' signals from all IPs -set ap_start_ports {} -foreach ip $ip_instances { - set ip_cell [get_bd_cells $ip] - set ip_pins [get_bd_pins -of $ip_cell] - foreach pin $ip_pins { - set pin_name [get_property NAME $pin] - if {[string match "ap_start" $pin_name]} { - lappend ap_start_ports $pin + # Collect 'ap_start' signals from all IPs + set ap_start_ports {} + foreach ip $ip_instances { + set ip_cell [get_bd_cells $ip] + set ip_pins [get_bd_pins -of $ip_cell] + foreach pin $ip_pins { + set pin_name [get_property NAME $pin] + if {[string match "ap_start" $pin_name]} { + lappend ap_start_ports $pin + } } } -} -# Loop over IP instances to connect outputs to inputs -for {set i 0} {$i < [expr {[llength $ip_instances] - 1}]} {incr i} { - # Get current IP and next IP - set ip_i [lindex $ip_instances $i] - set ip_i_plus1 [lindex $ip_instances [expr {$i + 1}]] - - # Get bd_cells for each IP - set ip_i_cell [get_bd_cells $ip_i] - set ip_i_plus1_cell [get_bd_cells $ip_i_plus1] - - if {$interface_type == "unpacked"} { - # Existing unpacked interface connection logic - # Get all output pins from ip_i - set output_ports [get_bd_pins -of $ip_i_cell] - - # Initialize arrays for output ports - array unset layer_out_ports_by_index - array unset layer_out_vld_ports_by_index - - # Filter output ports and extract indices - foreach port $output_ports { - set port_name [get_property NAME $port] - # Match 'layer_out_' or 'layer_out_' - if {[regexp {^layer(?:\d+_)?out_(\d+)$} $port_name all index]} { - set layer_out_ports_by_index($index) $port - } elseif {[regexp {^layer(?:\d+_)?out_(\d+)_ap_vld$} $port_name all index]} { - set layer_out_vld_ports_by_index($index) $port + # Loop over IP instances to connect outputs to inputs + for {set i 0} {$i < [expr {[llength $ip_instances] - 1}]} {incr i} { + # Get current IP and next IP + set ip_i [lindex $ip_instances $i] + set ip_i_plus1 [lindex $ip_instances [expr {$i + 1}]] + + # Get bd_cells for each IP + set ip_i_cell [get_bd_cells $ip_i] + set ip_i_plus1_cell [get_bd_cells $ip_i_plus1] + + if {$interface_type == "unpacked"} { + # Existing unpacked interface connection logic + # Get all output pins from ip_i + set output_ports [get_bd_pins -of $ip_i_cell] + + # Initialize arrays for output ports + array unset layer_out_ports_by_index + array unset layer_out_vld_ports_by_index + + # Filter output ports and extract indices + foreach port $output_ports { + set port_name [get_property NAME $port] + # Match 'layer_out_' or 'layer_out_' + if {[regexp {^layer(?:\d+_)?out_(\d+)$} $port_name all index]} { + set layer_out_ports_by_index($index) $port + } elseif {[regexp {^layer(?:\d+_)?out_(\d+)_ap_vld$} $port_name all index]} { + set layer_out_vld_ports_by_index($index) $port + } } - } - # Get all input pins from ip_i_plus1 - set input_ports [get_bd_pins -of $ip_i_plus1_cell] - - # Initialize arrays for input ports - array unset input_ports_by_index - array unset input_vld_ports_by_index - - # Filter input ports and extract indices - foreach port $input_ports { - set port_name [get_property NAME $port] - # Match '{name}_input_{index}' - if {[regexp {^\w+_input_(\d+)$} $port_name all index]} { - set input_ports_by_index($index) $port - } elseif {[regexp {^\w+_input_(\d+)_ap_vld$} $port_name all index]} { - set input_vld_ports_by_index($index) $port + # Get all input pins from ip_i_plus1 + set input_ports [get_bd_pins -of $ip_i_plus1_cell] + + # Initialize arrays for input ports + array unset 
input_ports_by_index + array unset input_vld_ports_by_index + + # Filter input ports and extract indices + foreach port $input_ports { + set port_name [get_property NAME $port] + # Match '{name}_input_{index}' + if {[regexp {^\w+_input_(\d+)$} $port_name all index]} { + set input_ports_by_index($index) $port + } elseif {[regexp {^\w+_input_(\d+)_ap_vld$} $port_name all index]} { + set input_vld_ports_by_index($index) $port + } + } + + # Connect data signals + foreach index [array names layer_out_ports_by_index] { + set out_port $layer_out_ports_by_index($index) + if {[info exists input_ports_by_index($index)]} { + set in_port $input_ports_by_index($index) + # Connect the ports + connect_bd_net $out_port $in_port + } else { + puts "Warning: No matching input port found for output [get_property NAME $out_port]" + } + } + + # Connect ap_vld signals + foreach index [array names layer_out_vld_ports_by_index] { + set out_vld_port $layer_out_vld_ports_by_index($index) + if {[info exists input_vld_ports_by_index($index)]} { + set in_vld_port $input_vld_ports_by_index($index) + # Connect the ports + connect_bd_net $out_vld_port $in_vld_port + } else { + puts "Warning: No matching input ap_vld port found for output [get_property NAME $out_vld_port]" + } + } + + # Connect 'ap_done' of ip_i to 'ap_start' of ip_i_plus1 + # Get 'ap_done' pin of ip_i + set ip_i_pins [get_bd_pins -of $ip_i_cell] + set ap_done_pin "" + foreach pin $ip_i_pins { + set pin_name [get_property NAME $pin] + if {[string match "ap_done" $pin_name]} { + set ap_done_pin $pin + break + } + } + + # Get 'ap_start' pin of ip_i_plus1 + set ip_i_plus1_pins [get_bd_pins -of $ip_i_plus1_cell] + set ap_start_pin "" + foreach pin $ip_i_plus1_pins { + set pin_name [get_property NAME $pin] + if {[string match "ap_start" $pin_name]} { + set ap_start_pin $pin + break + } } - } - # Connect data signals - foreach index [array names layer_out_ports_by_index] { - set out_port $layer_out_ports_by_index($index) - if {[info exists input_ports_by_index($index)]} { - set in_port $input_ports_by_index($index) - # Connect the ports - connect_bd_net $out_port $in_port + # Connect 'ap_done' of ip_i to 'ap_start' of ip_i_plus1 + if {[string length $ap_done_pin] > 0 && [string length $ap_start_pin] > 0} { + connect_bd_net $ap_done_pin $ap_start_pin + puts "Connected 'ap_done' of $ip_i to 'ap_start' of $ip_i_plus1" } else { - puts "Warning: No matching input port found for output [get_property NAME $out_port]" + puts "Warning: Could not find 'ap_done' or 'ap_start' pin for IPs $ip_i and $ip_i_plus1" + } + } elseif {$interface_type == "axi_stream"} { + # Get AXI Stream interface pins from ip_i and ip_i_plus1 + set ip_i_intf_pins [get_bd_intf_pins -of $ip_i_cell] + set ip_i_plus1_intf_pins [get_bd_intf_pins -of $ip_i_plus1_cell] + set ip_i_axis_master "" + set ip_i_plus1_axis_slave "" + + # Identify the Master (output) AXI Stream interface of ip_i + foreach intf_pin $ip_i_intf_pins { + set pin_name [get_property NAME $intf_pin] + # Assuming output interfaces have names ending with 'out' + if {[string match "*out" $pin_name]} { + set ip_i_axis_master $intf_pin + break + } + } + + # Identify the Slave (input) AXI Stream interface of ip_i_plus1 + foreach intf_pin $ip_i_plus1_intf_pins { + set pin_name [get_property NAME $intf_pin] + # Assuming input interfaces have names ending with 'input' + if {[string match "*input" $pin_name]} { + set ip_i_plus1_axis_slave $intf_pin + break + } } - } - # Connect ap_vld signals - foreach index [array names 
layer_out_vld_ports_by_index] { - set out_vld_port $layer_out_vld_ports_by_index($index) - if {[info exists input_vld_ports_by_index($index)]} { - set in_vld_port $input_vld_ports_by_index($index) - # Connect the ports - connect_bd_net $out_vld_port $in_vld_port + # Check if both interfaces are found + if {[string length $ip_i_axis_master] > 0 && [string length $ip_i_plus1_axis_slave] > 0} { + # Connect the AXI Stream interfaces + connect_bd_intf_net $ip_i_axis_master $ip_i_plus1_axis_slave + puts "Connected AXI Stream interface between $ip_i and $ip_i_plus1" } else { - puts "Warning: No matching input ap_vld port found for output [get_property NAME $out_vld_port]" + puts "Warning: Could not find matching AXI Stream interfaces for $ip_i and $ip_i_plus1" } } + } - # Connect 'ap_done' of ip_i to 'ap_start' of ip_i_plus1 - # Get 'ap_done' pin of ip_i - set ip_i_pins [get_bd_pins -of $ip_i_cell] - set ap_done_pin "" - foreach pin $ip_i_pins { - set pin_name [get_property NAME $pin] - if {[string match "ap_done" $pin_name]} { - set ap_done_pin $pin - break + if {$interface_type == "axi_stream"} { + # Create external port for 'ap_start' and connect all 'ap_start' pins + # ap_start in streaming IPs needs to be constantly high + if {[llength $ap_start_ports] > 0} { + create_bd_port -dir I ap_start + set ap_start_port [get_bd_ports ap_start] + foreach start_pin $ap_start_ports { + connect_bd_net $ap_start_port $start_pin } } - # Get 'ap_start' pin of ip_i_plus1 - set ip_i_plus1_pins [get_bd_pins -of $ip_i_plus1_cell] - set ap_start_pin "" - foreach pin $ip_i_plus1_pins { - set pin_name [get_property NAME $pin] - if {[string match "ap_start" $pin_name]} { - set ap_start_pin $pin - break + # Make external all input interfaces of the first IP + set first_ip_cell [get_bd_cells [lindex $ip_instances 0]] + if {[string length $first_ip_cell] == 0} { + puts "Error: Could not find the first IP cell." + return + } + set first_ip_intf_pins [get_bd_intf_pins -of $first_ip_cell] + set input_pin_names {} + foreach intf_pin $first_ip_intf_pins { + set pin_name [get_property NAME $intf_pin] + if {[string match "*s_axis*" $pin_name] || [string match "*inp*" $pin_name]} { + # Make the interface pin external + make_bd_intf_pins_external $intf_pin + # Retrieve the external interface port + set external_intf_port [get_bd_intf_ports -filter "NAME =~ \"${pin_name}*\""] + # Change name to base_name + set_property NAME $pin_name $external_intf_port + lappend input_pin_names $pin_name } } + if {[llength $input_pin_names] == 0} { + puts "Error: Could not find any input AXI Stream interfaces for first IP." + return + } - # Connect 'ap_done' of ip_i to 'ap_start' of ip_i_plus1 - if {[string length $ap_done_pin] > 0 && [string length $ap_start_pin] > 0} { - connect_bd_net $ap_done_pin $ap_start_pin - puts "Connected 'ap_done' of $ip_i to 'ap_start' of $ip_i_plus1" - } else { - puts "Warning: Could not find 'ap_done' or 'ap_start' pin for IPs $ip_i and $ip_i_plus1" + # Make external all output interfaces of the last IP + set last_ip_cell [get_bd_cells [lindex $ip_instances end]] + if {[string length $last_ip_cell] == 0} { + puts "Error: Could not find the last IP cell." 
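+            # The last IP supplies the design-level output interfaces in the
+            # AXI4-Stream flow; with its cell missing there is nothing to
+            # expose, so the stitch procedure stops here.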
+ return } - } elseif {$interface_type == "axi_stream"} { - # Get AXI Stream interface pins from ip_i and ip_i_plus1 - set ip_i_intf_pins [get_bd_intf_pins -of $ip_i_cell] - set ip_i_plus1_intf_pins [get_bd_intf_pins -of $ip_i_plus1_cell] - set ip_i_axis_master "" - set ip_i_plus1_axis_slave "" - - # Identify the Master (output) AXI Stream interface of ip_i - foreach intf_pin $ip_i_intf_pins { + set last_ip_intf_pins [get_bd_intf_pins -of $last_ip_cell] + set output_pin_names {} + foreach intf_pin $last_ip_intf_pins { set pin_name [get_property NAME $intf_pin] - # Assuming output interfaces have names ending with 'out' - if {[string match "*out" $pin_name]} { - set ip_i_axis_master $intf_pin - break + if {[string match "*m_axis*" $pin_name] || [string match "*out*" $pin_name]} { + # Make the interface pin external + make_bd_intf_pins_external $intf_pin + # Retrieve the external interface port and change name to base name + set external_intf_port [get_bd_intf_ports -filter "NAME =~ \"${pin_name}*\""] + set_property NAME $pin_name $external_intf_port + lappend output_pin_names $pin_name } } + if {[llength $output_pin_names] == 0} { + puts "Error: Could not find any output AXI Stream interfaces for last IP." + return + } - # Identify the Slave (input) AXI Stream interface of ip_i_plus1 - foreach intf_pin $ip_i_plus1_intf_pins { - set pin_name [get_property NAME $intf_pin] - # Assuming input interfaces have names ending with 'input' - if {[string match "*input" $pin_name]} { - set ip_i_plus1_axis_slave $intf_pin + # Associate input, output, and ap_rst to run at 'ap_clk' + # Join interface names with colons to match the required format + set associated_busif [join [concat $input_pin_names $output_pin_names] ":"] + set_property CONFIG.ASSOCIATED_BUSIF {$associated_busif} [get_bd_ports /ap_clk] + set_property CONFIG.ASSOCIATED_RESET $rst_port_name [get_bd_ports /ap_clk] + + # Make external the 'ap_done' signal of the last IP + set last_ip_pins [get_bd_pins -of $last_ip_cell] + set last_ap_done_pin "" + foreach pin $last_ip_pins { + set pin_name [get_property NAME $pin] + if {[string match "ap_done" $pin_name]} { + set last_ap_done_pin $pin break } } - - # Check if both interfaces are found - if {[string length $ip_i_axis_master] > 0 && [string length $ip_i_plus1_axis_slave] > 0} { - # Connect the AXI Stream interfaces - connect_bd_intf_net $ip_i_axis_master $ip_i_plus1_axis_slave - puts "Connected AXI Stream interface between $ip_i and $ip_i_plus1" + if {[string length $last_ap_done_pin] > 0} { + create_bd_port -dir O ap_done + set ap_done_port [get_bd_ports ap_done] + connect_bd_net $ap_done_port $last_ap_done_pin } else { - puts "Warning: Could not find matching AXI Stream interfaces for $ip_i and $ip_i_plus1" + puts "Warning: Could not find 'ap_done' pin for last IP" } - } -} - -if {$interface_type == "axi_stream"} { - # Create external port for 'ap_start' and connect all 'ap_start' pins - # ap_start in streaming IPs needs to be constantly high - if {[llength $ap_start_ports] > 0} { - create_bd_port -dir I ap_start - set ap_start_port [get_bd_ports ap_start] - foreach start_pin $ap_start_ports { - connect_bd_net $ap_start_port $start_pin + + } elseif {$interface_type == "unpacked"} { + # Make 'ap_start' of the first IP external + set first_ip_cell [get_bd_cells [lindex $ip_instances 0]] + if {[string length $first_ip_cell] == 0} { + puts "Error: Could not find the first IP cell." 
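+            # In the unpacked flow the first IP provides both the external
+            # 'ap_start' control and the top-level input pins, so stitching
+            # cannot continue without its cell.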
+ return } - } - - # Make external all input interfaces of the first IP - set first_ip_cell [get_bd_cells [lindex $ip_instances 0]] - if {[string length $first_ip_cell] == 0} { - puts "Error: Could not find the first IP cell." - return - } - set first_ip_intf_pins [get_bd_intf_pins -of $first_ip_cell] - set input_pin_names {} - foreach intf_pin $first_ip_intf_pins { - set pin_name [get_property NAME $intf_pin] - if {[string match "*s_axis*" $pin_name] || [string match "*inp*" $pin_name]} { - # Make the interface pin external - make_bd_intf_pins_external $intf_pin - # Retrieve the external interface port - set external_intf_port [get_bd_intf_ports -filter "NAME =~ \"${pin_name}*\""] - # Change name to base_name - set_property NAME $pin_name $external_intf_port - lappend input_pin_names $pin_name + set first_ip_pins [get_bd_pins -of $first_ip_cell] + set first_ap_start_pin "" + foreach pin $first_ip_pins { + set pin_name [get_property NAME $pin] + if {[string match "ap_start" $pin_name]} { + set first_ap_start_pin $pin + break + } } - } - if {[llength $input_pin_names] == 0} { - puts "Error: Could not find any input AXI Stream interfaces for first IP." - return - } - - # Make external all output interfaces of the last IP - set last_ip_cell [get_bd_cells [lindex $ip_instances end]] - if {[string length $last_ip_cell] == 0} { - puts "Error: Could not find the last IP cell." - return - } - set last_ip_intf_pins [get_bd_intf_pins -of $last_ip_cell] - set output_pin_names {} - foreach intf_pin $last_ip_intf_pins { - set pin_name [get_property NAME $intf_pin] - if {[string match "*m_axis*" $pin_name] || [string match "*out*" $pin_name]} { - # Make the interface pin external - make_bd_intf_pins_external $intf_pin - # Retrieve the external interface port and change name to base name - set external_intf_port [get_bd_intf_ports -filter "NAME =~ \"${pin_name}*\""] - set_property NAME $pin_name $external_intf_port - lappend output_pin_names $pin_name + if {[string length $first_ap_start_pin] > 0} { + create_bd_port -dir I ap_start + set ap_start_port [get_bd_ports ap_start] + connect_bd_net $ap_start_port $first_ap_start_pin + } else { + puts "Warning: Could not find 'ap_start' pin for first IP" } - } - if {[llength $output_pin_names] == 0} { - puts "Error: Could not find any output AXI Stream interfaces for last IP." - return - } - # Associate input, output, and ap_rst to run at 'ap_clk' - # Join interface names with colons to match the required format - set associated_busif [join [concat $input_pin_names $output_pin_names] ":"] - set_property CONFIG.ASSOCIATED_BUSIF {$associated_busif} [get_bd_ports /ap_clk] - set_property CONFIG.ASSOCIATED_RESET $rst_port_name [get_bd_ports /ap_clk] - - # Make external the 'ap_done' signal of the last IP - set last_ip_pins [get_bd_pins -of $last_ip_cell] - set last_ap_done_pin "" - foreach pin $last_ip_pins { - set pin_name [get_property NAME $pin] - if {[string match "ap_done" $pin_name]} { - set last_ap_done_pin $pin - break + # Make 'ap_done' of the last IP external + set last_ip_cell [get_bd_cells [lindex $ip_instances end]] + if {[string length $last_ip_cell] == 0} { + puts "Error: Could not find the last IP cell." 
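+            # 'ap_done' of the last IP doubles as the design-level done flag
+            # and its layer outputs become the external outputs, so this cell
+            # is equally indispensable.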
+ return } - } - if {[string length $last_ap_done_pin] > 0} { - create_bd_port -dir O ap_done - set ap_done_port [get_bd_ports ap_done] - connect_bd_net $ap_done_port $last_ap_done_pin - } else { - puts "Warning: Could not find 'ap_done' pin for last IP" - } - -} elseif {$interface_type == "unpacked"} { - # Make 'ap_start' of the first IP external - set first_ip_cell [get_bd_cells [lindex $ip_instances 0]] - if {[string length $first_ip_cell] == 0} { - puts "Error: Could not find the first IP cell." - return - } - set first_ip_pins [get_bd_pins -of $first_ip_cell] - set first_ap_start_pin "" - foreach pin $first_ip_pins { - set pin_name [get_property NAME $pin] - if {[string match "ap_start" $pin_name]} { - set first_ap_start_pin $pin - break + set last_ip_pins [get_bd_pins -of $last_ip_cell] + set last_ap_done_pin "" + foreach pin $last_ip_pins { + set pin_name [get_property NAME $pin] + if {[string match "ap_done" $pin_name]} { + set last_ap_done_pin $pin + break + } } - } - if {[string length $first_ap_start_pin] > 0} { - create_bd_port -dir I ap_start - set ap_start_port [get_bd_ports ap_start] - connect_bd_net $ap_start_port $first_ap_start_pin - } else { - puts "Warning: Could not find 'ap_start' pin for first IP" - } - - # Make 'ap_done' of the last IP external - set last_ip_cell [get_bd_cells [lindex $ip_instances end]] - if {[string length $last_ip_cell] == 0} { - puts "Error: Could not find the last IP cell." - return - } - set last_ip_pins [get_bd_pins -of $last_ip_cell] - set last_ap_done_pin "" - foreach pin $last_ip_pins { - set pin_name [get_property NAME $pin] - if {[string match "ap_done" $pin_name]} { - set last_ap_done_pin $pin - break + if {[string length $last_ap_done_pin] > 0} { + create_bd_port -dir O ap_done + set ap_done_port [get_bd_ports ap_done] + connect_bd_net $ap_done_port $last_ap_done_pin + } else { + puts "Warning: Could not find 'ap_done' pin for last IP" } - } - if {[string length $last_ap_done_pin] > 0} { - create_bd_port -dir O ap_done - set ap_done_port [get_bd_ports ap_done] - connect_bd_net $ap_done_port $last_ap_done_pin - } else { - puts "Warning: Could not find 'ap_done' pin for last IP" - } - # Make external all inputs of the first IP (including 'vld' signals) - set input_pin_names {} - foreach pin $first_ip_pins { - set pin_name [get_property NAME $pin] - # Match patterns for inputs and input valid pins - if {[regexp {^\w+_(input|inp|layer)(?:_(\d+))?(?:_ap_vld)?$} $pin_name]} { - # Make the pin external - make_bd_pins_external $pin - # Retrieve the external port and change name to base name - set external_port [get_bd_ports -filter "NAME =~ \"${pin_name}*\""] - set_property NAME $pin_name $external_port - lappend input_pin_names $pin_name + # Make external all inputs of the first IP (including 'vld' signals) + set input_pin_names {} + foreach pin $first_ip_pins { + set pin_name [get_property NAME $pin] + # Match patterns for inputs and input valid pins + if {[regexp {^\w+_(input|inp|layer)(?:_(\d+))?(?:_ap_vld)?$} $pin_name]} { + # Make the pin external + make_bd_pins_external $pin + # Retrieve the external port and change name to base name + set external_port [get_bd_ports -filter "NAME =~ \"${pin_name}*\""] + set_property NAME $pin_name $external_port + lappend input_pin_names $pin_name + } + } + if {[llength $input_pin_names] == 0} { + puts "Error: Could not find any input pins for first IP." + return } - } - if {[llength $input_pin_names] == 0} { - puts "Error: Could not find any input pins for first IP." 
- return - } - # Make external all outputs of the last IP (including 'vld' signals) - set output_pin_names {} - foreach pin $last_ip_pins { - set pin_name [get_property NAME $pin] - # Match patterns for outputs and output valid pins - if {[regexp {^layer(?:\d+_)?out(?:_(\d+))?(?:_ap_vld)?$} $pin_name]} { - # Make the pin external - make_bd_pins_external $pin - # Retrieve the external port and change name to base name - set external_port [get_bd_ports -filter "NAME =~ \"${pin_name}*\""] - set_property NAME $pin_name $external_port - lappend output_pin_names $pin_name + # Make external all outputs of the last IP (including 'vld' signals) + set output_pin_names {} + foreach pin $last_ip_pins { + set pin_name [get_property NAME $pin] + # Match patterns for outputs and output valid pins + if {[regexp {^layer(?:\d+_)?out(?:_(\d+))?(?:_ap_vld)?$} $pin_name]} { + # Make the pin external + make_bd_pins_external $pin + # Retrieve the external port and change name to base name + set external_port [get_bd_ports -filter "NAME =~ \"${pin_name}*\""] + set_property NAME $pin_name $external_port + lappend output_pin_names $pin_name + } + } + if {[llength $output_pin_names] == 0} { + puts "Error: Could not find any output pins for last IP." + return } } - if {[llength $output_pin_names] == 0} { - puts "Error: Could not find any output pins for last IP." - return - } -} -validate_bd_design + validate_bd_design -regenerate_bd_layout + regenerate_bd_layout -save_bd_design + save_bd_design -puts "###########################################################" -puts "# Successfully connected the ports of each IP instance " -puts "# A total of $repo_count IPs were connected. " -puts "###########################################################" + puts "###########################################################" + puts "# Successfully connected the ports of each IP instance " + puts "# A total of $repo_count IPs were connected. " + puts "###########################################################" + +} + +if {$stitch_design} { + stitch_procedure $base_dir $stitch_project_name $bd_name $part +} else { + set existing_stitch_project_name [file join $stitch_project_name "$stitch_project_name.xpr"] + if {[file exists $existing_stitch_project_name]} { + puts "Opening existing project: $existing_stitch_project_name" + open_project $existing_stitch_project_name + } else { + puts "Error: Project file '$existing_stitch_project_name' does not exist." + exit 1 + } +} if {$export_design} { puts "Exporting the final stitched IP..." diff --git a/hls4ml/utils/simulation_utils.py b/hls4ml/utils/simulation_utils.py index fa93843268..289aa68e00 100644 --- a/hls4ml/utils/simulation_utils.py +++ b/hls4ml/utils/simulation_utils.py @@ -55,65 +55,111 @@ def parse_component_xml(component_xml_path): def generate_verilog_testbench(nn_config, testbench_output_path): + """ + Generate a Verilog testbench for a given neural network configuration. 
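+    Expects nn_config['inputs'] and nn_config['outputs'] to be lists of layer
+    descriptors; an illustrative entry (field values made up for this note) is
+        {'name': 'fc1_input', 'integer_bits': 6, 'fractional_bits': 10,
+         'batch_size': 4, 'fifo_depth': 1}
+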
+ The testbench includes: + - Clock and reset logic + - DUT instantiation and AXI4-Stream interfaces + - Stimulus generation for inputs + - Data capture and logging for outputs + - Latency measurement + """ inputs = nn_config['inputs'] outputs = nn_config['outputs'] input_signals = [] output_signals = [] + # Collect input signals (name and total bitwidth) for input_item in inputs: total_bits = input_item['integer_bits'] + input_item['fractional_bits'] input_signals.append((input_item['name'], total_bits)) + # Collect output signals (name and total bitwidth) for output_item in outputs: total_bits = output_item['integer_bits'] + output_item['fractional_bits'] output_signals.append((output_item['name'], total_bits)) with open(testbench_output_path, 'w') as f: - # Write the initial part of the testbench + #---------------------------------------------------------------------- + # Header and Module Declaration + #---------------------------------------------------------------------- f.write('`timescale 1ns / 1ps\n\n') f.write('module tb_design_1_wrapper;\n\n') + + #---------------------------------------------------------------------- + # Clock and Reset Signals + #---------------------------------------------------------------------- + f.write(' //------------------------------------------------------------------------\n') f.write(' // Clock and Reset Signals\n') + f.write(' //------------------------------------------------------------------------\n') f.write(' reg ap_clk;\n') f.write(' reg ap_rst_n;\n\n') - f.write(' // Control Signals\n') - f.write(' reg ap_start;\n') + + #---------------------------------------------------------------------- + # Control and Handshaking Signals + #---------------------------------------------------------------------- + f.write(' //------------------------------------------------------------------------\n') + f.write(' // Control and Handshaking Signals\n') + f.write(' //------------------------------------------------------------------------\n') + f.write(' reg ap_start;\n') f.write(' wire ap_done;\n\n') - # Generate AXI4-Stream interface signals for inputs + #---------------------------------------------------------------------- + # AXI4-Stream Input Interfaces + #---------------------------------------------------------------------- + f.write(' //------------------------------------------------------------------------\n') + f.write(' // AXI4-Stream Input Interfaces\n') + f.write(' //------------------------------------------------------------------------\n') + for layer in nn_config['inputs']: total_bits = layer['integer_bits'] + layer['fractional_bits'] - f.write(f' reg [{(total_bits * layer["batch_size"]) - 1}:0] {layer["name"]}_tdata;\n') - f.write(f' reg {layer["name"]}_tvalid;\n') + batch_size = layer['batch_size'] + f.write(f' reg [{(total_bits * batch_size) - 1}:0] {layer["name"]}_tdata;\n') + f.write(f' reg {layer["name"]}_tvalid;\n') f.write(f' wire {layer["name"]}_tready;\n\n') - # Generate AXI4-Stream interface signals for outputs + #---------------------------------------------------------------------- + # AXI4-Stream Output Interfaces + #---------------------------------------------------------------------- + f.write(' //------------------------------------------------------------------------\n') + f.write(' // AXI4-Stream Output Interfaces\n') + f.write(' //------------------------------------------------------------------------\n') + for layer in nn_config['outputs']: total_bits = layer['integer_bits'] + layer['fractional_bits'] - 
f.write(f' wire [{(total_bits * layer["batch_size"]) - 1}:0] {layer["name"]}_tdata;\n') + batch_size = layer['batch_size'] + f.write(f' wire [{(total_bits * batch_size) - 1}:0] {layer["name"]}_tdata;\n') f.write(f' wire {layer["name"]}_tvalid;\n') - f.write(f' reg {layer["name"]}_tready;\n\n') - - # Instantiate the DUT - f.write(' // Instantiate the Design Under Test (DUT)\n') + f.write(f' reg {layer["name"]}_tready;\n\n') + + #---------------------------------------------------------------------- + # DUT Instantiation + #---------------------------------------------------------------------- + f.write(' //------------------------------------------------------------------------\n') + f.write(' // DUT Instantiation\n') + f.write(' //------------------------------------------------------------------------\n') f.write(' stitched_design dut (\n') f.write(' .ap_clk(ap_clk),\n') f.write(' .ap_done(ap_done),\n') f.write(' .ap_rst_n(ap_rst_n),\n') f.write(' .ap_start(ap_start),\n') - # Connect input AXI4-Stream interfaces + + # Connect input interfaces for layer in nn_config['inputs']: name = layer["name"] f.write(f' .{name}_tdata({name}_tdata),\n') f.write(f' .{name}_tready({name}_tready),\n') f.write(f' .{name}_tvalid({name}_tvalid),\n') - # Connect output AXI4-Stream interfaces + + # Connect output interfaces (all but last have trailing comma) for layer in nn_config['outputs'][:-1]: name = layer["name"] f.write(f' .{name}_tdata({name}_tdata),\n') f.write(f' .{name}_tready({name}_tready),\n') f.write(f' .{name}_tvalid({name}_tvalid),\n') - # Handle the last output layer without a trailing comma + + # Last output interface (no trailing comma) last_output_layer = nn_config['outputs'][-1] name = last_output_layer["name"] f.write(f' .{name}_tdata({name}_tdata),\n') @@ -121,26 +167,37 @@ def generate_verilog_testbench(nn_config, testbench_output_path): f.write(f' .{name}_tvalid({name}_tvalid)\n') f.write(' );\n\n') - # Add clock generation - f.write(' // Clock Generation (100 MHz)\n') + #---------------------------------------------------------------------- + # Clock Generation + #---------------------------------------------------------------------- + f.write(' //------------------------------------------------------------------------\n') + f.write(' // Clock Generation (100 MHz => 10 ns period)\n') + f.write(' //------------------------------------------------------------------------\n') f.write(' initial begin\n') f.write(' ap_clk = 0;\n') - f.write(' forever #5 ap_clk = ~ap_clk; // Clock period of 10 ns\n') + f.write(' forever #5 ap_clk = ~ap_clk;\n') f.write(' end\n\n') - # Reset generation + #---------------------------------------------------------------------- + # Reset Generation + #---------------------------------------------------------------------- + f.write(' //------------------------------------------------------------------------\n') f.write(' // Reset Generation\n') + f.write(' // Wait for a few cycles and then release reset.\n') + f.write(' //------------------------------------------------------------------------\n') f.write(' initial begin\n') - f.write(' ap_rst_n = 0;\n') + f.write(' ap_rst_n = 0;\n') f.write(' repeat (5) @(posedge ap_clk);\n') f.write(' ap_rst_n = 1;\n') f.write(' end\n\n') - # Initialize Control Signals - f.write(' // Control Signal Initialization\n') - f.write(' integer csv_file;\n') - f.write(' integer j;\n') - f.write(' integer total_bits;\n') + #---------------------------------------------------------------------- + # Signal Initialization + 
#---------------------------------------------------------------------- + f.write(' //------------------------------------------------------------------------\n') + f.write(' // Signal Initialization\n') + f.write(' // Initialize control signals, input valid, and output ready.\n') + f.write(' //------------------------------------------------------------------------\n') f.write(' initial begin\n') f.write(' ap_start = 0;\n') for name, _ in input_signals: @@ -149,11 +206,28 @@ def generate_verilog_testbench(nn_config, testbench_output_path): f.write(f' {name}_tready = 1;\n') f.write(' end\n\n') - # Cycle counter - f.write(' // Cycle counter\n') + #---------------------------------------------------------------------- + # Variables for Logging and Measurement + #---------------------------------------------------------------------- + f.write(' //------------------------------------------------------------------------\n') + f.write(' // Logging and Measurement Variables\n') + f.write(' //------------------------------------------------------------------------\n') + f.write(' integer csv_file;\n') + f.write(' integer j;\n') + f.write(' integer total_bits;\n') f.write(' reg [63:0] cycle_count = 0;\n') f.write(' reg [63:0] start_cycle = 0;\n') f.write(' reg [63:0] end_cycle = 0;\n') + f.write(' reg [1:0] done_counter = 0;\n') + f.write(' reg old_ap_done = 0;\n\n') + + #---------------------------------------------------------------------- + # Cycle Counting + #---------------------------------------------------------------------- + f.write(' //------------------------------------------------------------------------\n') + f.write(' // Cycle Counting\n') + f.write(' // Count cycles to measure latency.\n') + f.write(' //------------------------------------------------------------------------\n') f.write(' always @(posedge ap_clk) begin\n') f.write(' if (!ap_rst_n)\n') f.write(' cycle_count <= 0;\n') @@ -161,101 +235,130 @@ def generate_verilog_testbench(nn_config, testbench_output_path): f.write(' cycle_count <= cycle_count + 1;\n') f.write(' end\n\n') - # Data Transmission - f.write(' // Data Transmission\n') + #---------------------------------------------------------------------- + # Data Transmission (Stimulus Generation) + #---------------------------------------------------------------------- + f.write(' //------------------------------------------------------------------------\n') + f.write(' // Data Transmission (Stimulus)\n') + f.write(' // Send input patterns to the DUT.\n') + f.write(' //------------------------------------------------------------------------\n') f.write(' initial begin\n') - f.write(' // Wait for reset deassertion\n') + f.write(' // Wait until reset is de-asserted\n') f.write(' wait (ap_rst_n == 1);\n') f.write(' repeat (2) @(posedge ap_clk);\n\n') - f.write(' // Start the operation\n') + f.write(' // Open CSV log file\n') f.write(' csv_file = $fopen("testbench_log.csv", "w");\n') f.write(' if (csv_file == 0) begin\n') - f.write(' $display("ERROR: Could not open csv log file.");\n') - f.write(' $finish;\n') + f.write(' $display("ERROR: Could not open CSV log file.");\n') + f.write(' $finish;\n') f.write(' end\n') f.write(' $fwrite(csv_file, "output_name,index,value\\n");\n\n') - f.write(' ap_start = 1;\n') - # First Data Pattern: All Zeros - for layer in nn_config['inputs']: - f.write(f' // Sending all zeros for {layer["name"]}\n') - f.write(f' {layer["name"]}_tvalid = 1;\n') - f.write(f' for (j = 0; j < {layer["fifo_depth"]}; j = j + 1) begin\n') - for k in 
range(layer['batch_size']):
-                upper = (k + 1) * (layer["integer_bits"] + layer["fractional_bits"]) - 1
-                lower = k * (layer["integer_bits"] + layer["fractional_bits"])
-                f.write(f'            {layer["name"]}_tdata[{upper}:{lower}] = 0;\n')
-            f.write(f'            while ({layer["name"]}_tready == 0) @(posedge ap_clk);\n')
-            f.write(f'            @(posedge ap_clk);\n')
-            f.write(f'        end\n')
-            f.write(f'        {layer["name"]}_tvalid = 0;\n\n')
-
-        # Second Data Pattern: Fixed Value of 1
+        f.write('        // Start the DUT\n')
+        f.write('        ap_start = 1;\n\n')
+
+        # Send first pattern of inputs (all zeroes)
         for layer in nn_config['inputs']:
-            f.write(f'            // Sending fixed value 1 for {layer["name"]}\n')
-            f.write(f'            {layer["name"]}_tvalid = 1;\n')
-            f.write(f'            for (j = 0; j < {layer["fifo_depth"]}; j = j + 1) begin\n')
-            for k in range(layer['batch_size']):
-                upper = (k + 1) * (layer["integer_bits"] + layer["fractional_bits"]) - 1
-                lower = k * (layer["integer_bits"] + layer["fractional_bits"])
-                f.write(f'                {layer["name"]}_tdata[{upper}:{lower}] = 1 << {layer["fractional_bits"]};\n')
-            f.write(f'            while ({layer["name"]}_tready == 0) @(posedge ap_clk);\n')
-            f.write(f'            @(posedge ap_clk);\n')
-            f.write(f'        end\n')
-            f.write(f'        {layer["name"]}_tvalid = 0;\n\n')
-
-        f.write('        start_cycle = cycle_count;\n\n')
-        # Third Data Pattern: All zeros (here measure output and cycles)
+            i_bits = layer["integer_bits"]
+            f_bits = layer["fractional_bits"]
+            total_bits = i_bits + f_bits
+            batch_size = layer['batch_size']
+            fifo_depth = layer["fifo_depth"]
+            name = layer["name"]
+            f.write(f'        // Sending 1st pattern of inputs for {name}\n')
+            f.write(f'        {name}_tvalid = 1;\n')
+            f.write(f'        for (j = 0; j < {fifo_depth}; j = j + 1) begin\n')
+            for k in range(batch_size):
+                upper = (k + 1) * total_bits - 1
+                lower = k * total_bits
+                f.write(f'            {name}_tdata[{upper}:{lower}] = 0;\n')
+            f.write(f'            while ({name}_tready == 0) @(posedge ap_clk);\n')
+            f.write('            @(posedge ap_clk);\n')
+            f.write('        end\n')
+            f.write(f'        {name}_tvalid = 0;\n\n')
+
+        # Send second pattern of inputs
         for layer in nn_config['inputs']:
-            f.write(f'            // Sending all zeros for {layer["name"]} (here we measure output and cycles)\n')
-            f.write(f'            {layer["name"]}_tvalid = 1;\n')
-            f.write(f'            for (j = 0; j < {layer["fifo_depth"]}; j = j + 1) begin\n')
-            for k in range(layer['batch_size']):
-                upper = (k + 1) * (layer["integer_bits"] + layer["fractional_bits"]) - 1
-                lower = k * (layer["integer_bits"] + layer["fractional_bits"])
-                f.write(f'                {layer["name"]}_tdata[{upper}:{lower}] = 0;\n')
-            f.write(f'            while ({layer["name"]}_tready == 0) @(posedge ap_clk);\n')
-            f.write(f'            @(posedge ap_clk);\n')
-            f.write(f'        end\n')
-            f.write(f'        {layer["name"]}_tvalid = 0;\n\n')
-
-        f.write('        // Wait for operation to complete\n')
-        f.write('        wait (ap_done == 1);\n')
-        f.write('        end_cycle = cycle_count;\n')
-        f.write('        $display("Total cycles from start to done: %0d", end_cycle - start_cycle);\n')
-        f.write('        // Write latency to JSON\n')
-        f.write('        $fwrite(csv_file, "latency_cycles,0,%0d\\n", end_cycle - start_cycle);\n')
-        f.write('        repeat (2) @(posedge ap_clk);\n')
-        f.write('        $fclose(csv_file);\n')
-        f.write('        $finish;\n')
+            i_bits = layer["integer_bits"]
+            f_bits = layer["fractional_bits"]
+            total_bits = i_bits + f_bits
+            batch_size = layer['batch_size']
+            fifo_depth = layer["fifo_depth"]
+            name = layer["name"]
+            f.write(f'        // Sending 2nd pattern of inputs for {name}\n')
+            f.write(f'        {name}_tvalid = 1;\n')
+            f.write(f'        for (j = 0; j < {fifo_depth}; j = j + 1) begin\n')
+            for k in range(batch_size):
+                upper = (k + 1) * total_bits 
- 1 + lower = k * total_bits + f.write(f' {name}_tdata[{upper}:{lower}] = 1 << {f_bits};\n') + f.write(f' while ({name}_tready == 0) @(posedge ap_clk);\n') + f.write(' @(posedge ap_clk);\n') + f.write(' end\n') + f.write(' end\n\n') - # Output Handling - f.write(' // Output Data Capture\n') + #---------------------------------------------------------------------- + # Output Data Capture and Logging + #---------------------------------------------------------------------- + f.write(' //------------------------------------------------------------------------\n') + f.write(' // Output Data Capture and Logging\n') + f.write(' // Capture output for 2nd input (done_counter == 1) and log them to CSV.\n') + f.write(' //------------------------------------------------------------------------\n\n') + for i, layer in enumerate(nn_config['outputs']): - signed_str = layer.get('signed', 1) i_bits = layer['integer_bits'] f_bits = layer['fractional_bits'] total_bits = i_bits + f_bits layer_name = layer["name"] + f.write(f' //Output capture for {layer_name}\n') f.write(f' integer idx_{i};\n') f.write(f' reg signed [{total_bits-1}:0] fixed_val_{i};\n') f.write(f' real real_val_{i};\n') - f.write(f' always @(posedge ap_clk) begin\n') - f.write(f' if ({layer_name}_tvalid && {layer_name}_tready) begin\n') + f.write(f' if (done_counter == 1 && {layer_name}_tvalid && {layer_name}_tready) begin\n') f.write(f' for (idx_{i} = 0; idx_{i} < {layer["batch_size"]}; idx_{i} = idx_{i} + 1) begin\n') f.write(f' fixed_val_{i} = {layer_name}_tdata[(idx_{i}+1)*{total_bits}-1 -: {total_bits}];\n') - f.write(f' real_val_{i} = fixed_val_{i} / (1.0 * (1 << {f_bits}));\n') - f.write(f' $display("Output {layer["name"]}[%0d]: integer_bits=%0d fractional_bits=%0d value=%f", idx_{i}, {i_bits}, {f_bits}, real_val_{i});\n') - f.write(' // Write to csv file\n') + f.write(f' real_val_{i} = fixed_val_{i} / (1.0 * (1 << {f_bits}));\n') + f.write(f' $display("Output {layer_name}[%0d]: integer_bits=%0d fractional_bits=%0d value=%f", idx_{i}, {i_bits}, {f_bits}, real_val_{i});\n') + f.write(' // Log result to CSV\n') f.write(f' $fwrite(csv_file, "%s,%0d,%f\\n", "{layer_name}", idx_{i}, real_val_{i});\n') f.write(' end\n') f.write(' end\n') f.write(' end\n\n') + #---------------------------------------------------------------------- + # Latency Measurement and Test End + #---------------------------------------------------------------------- + f.write(' //------------------------------------------------------------------------\n') + f.write(' // Latency Measurement\n') + f.write(' // Measures the cycle count between start and subsequent ap_done signals.\n') + f.write(' //------------------------------------------------------------------------\n') + f.write(' always @(posedge ap_clk) begin\n') + f.write(' if (!ap_rst_n) begin\n') + f.write(' old_ap_done <= 0;\n') + f.write(' end else begin\n') + f.write(' old_ap_done <= ap_done;\n') + f.write(' // Detect rising edge of ap_done\n') + f.write(' if (ap_done && !old_ap_done) begin\n') + f.write(' done_counter <= done_counter + 1;\n') + f.write(' if (done_counter == 0) begin\n') + f.write(' start_cycle = cycle_count;\n') + f.write(' $display("Worst latency (first input set): %0d cycles", cycle_count);\n') + f.write(' $fwrite(csv_file, "%s,%0d,%0d\\n", "WorstLatency", 0, cycle_count);\n') + f.write(' end else if (done_counter == 1) begin\n') + f.write(' end_cycle = cycle_count;\n') + f.write(' $display("Best latency (second input set): %0d cycles", end_cycle - start_cycle);\n') + f.write(' 
$fwrite(csv_file, "%s,%0d,%0d\\n", "BestLatency", 0, end_cycle - start_cycle);\n')
+    f.write('                    $fclose(csv_file);\n')
+    f.write('                    $finish;\n')
+    f.write('                end\n')
+    f.write('            end\n')
+    f.write('        end\n')
+    f.write('    end\n\n')
+
     f.write('endmodule\n')
@@ -267,20 +370,31 @@ def read_testbench_log(testbench_log_path):
         print(f"Error: The file '{testbench_log_path}' does not exist.")
         return {}
 
-    df = pd.read_csv(testbench_log_path)
-    latency = df[df['output_name'] == 'latency_cycles']['value'].iloc[0]
-    grouped = df[df['output_name'] != 'latency_cycles'].groupby('output_name')
-
-    sim_dict = {
-        'latency_cycles': int(latency),
-        'outputs': {}
-    }
-
-    for name, group in grouped:
-        indices = group['index'].astype(int)
-        values = group['value'].astype(float)
-        array = np.zeros(max(indices) + 1, dtype=float)
-        array[indices] = values
-        sim_dict['outputs'][name] = array
-
-    return sim_dict
+    try:
+        df = pd.read_csv(testbench_log_path)
+        BestLatency = df[df['output_name'] == 'BestLatency']['value'].iloc[0]
+        WorstLatency = df[df['output_name'] == 'WorstLatency']['value'].iloc[0]
+        output_df = df[~df['output_name'].isin(['BestLatency', 'WorstLatency'])]
+
+        sim_dict = {
+            'BestLatency': int(BestLatency),
+            'WorstLatency': int(WorstLatency),
+            'outputs': {}
+        }
+
+        grouped = output_df.groupby('output_name')
+        for name, group in grouped:
+            indices = group['index'].astype(int)
+            values = group['value'].astype(float)
+            array = np.zeros(max(indices) + 1, dtype=float)
+            array[indices] = values
+            sim_dict['outputs'][name] = array
+
+        return sim_dict
+
+    except (KeyError, IndexError) as e:
+        print(f"Error: Missing expected columns or values in the file: {e}")
+        return {}
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        return {}
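For reference, a minimal sketch of the CSV layout that the reworked read_testbench_log above consumes. The column names come from the code; the header row and the concrete values are illustrative only:

    import io
    import pandas as pd

    csv_text = io.StringIO(
        "output_name,index,value\n"
        "layer13_out,0,0.875\n"
        "layer13_out,1,-0.25\n"
        "BestLatency,0,42\n"
        "WorstLatency,0,57\n"
    )
    df = pd.read_csv(csv_text)
    print(int(df[df['output_name'] == 'BestLatency']['value'].iloc[0]))  # 42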
From f1e2e57f41ea695eb39217dfd34458d23dd75432 Mon Sep 17 00:00:00 2001
From: dimdano
Date: Wed, 18 Dec 2024 17:44:21 +0100
Subject: [PATCH 24/50] prepare testbench input from user

---
 hls4ml/backends/vitis/vitis_backend.py  | 39 ++++++++++++++----
 hls4ml/model/graph.py                   | 35 +++++++++++++++-
 hls4ml/templates/vivado/ip_stitcher.tcl | 41 ++++++++++---------
 hls4ml/utils/simulation_utils.py        | 53 ++++++++++++++++++++++++-
 4 files changed, 136 insertions(+), 32 deletions(-)

diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py
index 9eb157079e..527f2de252 100644
--- a/hls4ml/backends/vitis/vitis_backend.py
+++ b/hls4ml/backends/vitis/vitis_backend.py
@@ -8,7 +8,7 @@
 from hls4ml.backends import VivadoBackend
 from hls4ml.model.flow import get_flow, register_flow
 from hls4ml.report import parse_vivado_report, aggregate_graph_reports
-from hls4ml.utils.simulation_utils import generate_verilog_testbench, read_testbench_log
+from hls4ml.utils.simulation_utils import write_verilog_testbench, read_testbench_log, write_testbench_input, prepare_testbench_input, prepare_zero_input
 
 
 class VitisBackend(VivadoBackend):
@@ -132,8 +132,17 @@ def build(
 
         return parse_vivado_report(output_dir)
 
-    def build_stitched_design(self, output_dir, project_name, stitch_design=True, sim_stitched_design=False, export_stitched_design=False, nn_config=None, graph_reports=None):
-
+    def build_stitched_design(
+        self,
+        output_dir,
+        project_name,
+        stitch_design=True,
+        sim_stitched_design=False,
+        export_stitched_design=False,
+        nn_config=None,
+        graph_reports=None,
+        simulation_input_data=None):
+
         os.makedirs(output_dir, exist_ok=True)
         stitched_design_dir = os.path.join(output_dir, project_name)
         if stitch_design:
@@ -141,11 +150,11 @@ def build_stitched_design(self, output_dir, project_name, stitch_design=True, si
                 raise FileExistsError(f"The directory '{stitched_design_dir}' already exists.")
             os.makedirs(stitched_design_dir)
 
-        spec = importlib.util.find_spec("hls4ml")
+        spec = importlib.util.find_spec('hls4ml')
         hls4ml_path = os.path.dirname(spec.origin)
         ip_stitcher_path = os.path.join(hls4ml_path, 'templates/vivado/ip_stitcher.tcl')
-        nn_config_path = os.path.join(stitched_design_dir, "nn_config.json")
-        testbench_path = os.path.join(stitched_design_dir, "testbench.v")
+        nn_config_path = os.path.join(stitched_design_dir, 'nn_config.json')
+        testbench_path = os.path.join(stitched_design_dir, 'testbench.v')
 
         try:
             shutil.copy(ip_stitcher_path, stitched_design_dir)
@@ -157,8 +166,22 @@ def build_stitched_design(self, output_dir, project_name, stitch_design=True, si
             json.dump(nn_config, file, indent=4)
 
         if(sim_stitched_design):
-            generate_verilog_testbench(nn_config, testbench_path)
-            print('Verilog testbench generated.')
+            write_verilog_testbench(nn_config, testbench_path)
+
+            # Produce testbench input file for every input layer
+            for i, layer in enumerate(nn_config['inputs']):
+                layer_name = layer['name']
+                frac_bits = layer['fractional_bits']
+                total_bits = layer['fractional_bits'] + layer['integer_bits']
+                testbench_input_path = os.path.join(stitched_design_dir, f"{layer_name}_input_data.txt")
+                # We reshape simulation input data to (fifo_depth, batch_size)
+                if simulation_input_data is None:
+                    input_data_reshaped = prepare_zero_input(layer)
+                else:
+                    data = simulation_input_data[i]
+                    input_data_reshaped = prepare_testbench_input(data, layer['fifo_depth'], layer['batch_size'])
+                write_testbench_input(input_data_reshaped, testbench_input_path, frac_bits, total_bits)
+            print('Verilog testbench and its input data was generated.')
 
         print('Running build process of stitched IP...\n')
         stitch_command = [
diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py
index 8544d60a4c..458c5aad04 100644
--- a/hls4ml/model/graph.py
+++ b/hls4ml/model/graph.py
@@ -6,6 +6,8 @@
 import numpy as np
 import numpy.ctypeslib as npc
 import copy
+import importlib.util
+import shutil
 import re
 import warnings
 import concurrent.futures
@@ -1106,6 +1108,7 @@ def build_wrapper(g, **kwargs):
                     build_results[project_name] = None
 
         self.graph_reports=build_results
+        self._replace_logos()
 
         if stitch_design or sim_stitched_design or export_stitched_design:
             nn_config = self.parse_nn_config()
@@ -1141,7 +1144,8 @@ def predict(self, x, sim_stitched_design = False):
                 sim_stitched_design=True,
                 export_stitched_design=False,
                 nn_config=nn_config,
-                graph_reports=self.graph_reports)
+                graph_reports=self.graph_reports,
+                simulation_input_data=x)
             return stitched_report
 
     def trace(self, x):
@@ -1163,4 +1167,31 @@ def _print_status(self, status):
             'Failed': '❌'
         }
         status_str = ' | '.join(f'{proj}: {status_icons.get(stat, "?")}' for proj, stat in status.items())
-        print(status_str, flush=True)
\ No newline at end of file
+        print(status_str, flush=True)
+
+    def _replace_logos(self):
+        spec = importlib.util.find_spec("hls4ml")
+        hls4ml_path = os.path.dirname(spec.origin)
+        hls4ml_logo = os.path.join(hls4ml_path, '../docs/img/logo.png')
+
+        if not os.path.isfile(hls4ml_logo):
+            raise FileNotFoundError(f"hls4ml logo not found at: {hls4ml_logo}")
+
+        for graph in self.graphs:
+            graph_logo_paths = [
+                os.path.join(
+                    graph.config.get_output_dir(),
graph.config.get_project_name() + '_prj', + 'solution1/impl/ip/misc/logo.png' + ) + ] + try: + for logo in graph_logo_paths: + shutil.copy(hls4ml_logo, logo) + except Exception as e: + print(f"Error copying hls4ml logo to {graph.config.get_output_dir()} project: {e}") diff --git a/hls4ml/templates/vivado/ip_stitcher.tcl b/hls4ml/templates/vivado/ip_stitcher.tcl index 6612064a6d..7c42c90036 100644 --- a/hls4ml/templates/vivado/ip_stitcher.tcl +++ b/hls4ml/templates/vivado/ip_stitcher.tcl @@ -545,7 +545,7 @@ if {$stitch_design} { } if {$export_design} { - puts "Exporting the final stitched IP..." + puts "Exporting stitched IP..." set stitched_ip_dir "ip_repo" ipx::package_project -root_dir $stitched_ip_dir \ -vendor user.org -library user -taxonomy /UserIP -module $bd_name \ @@ -560,31 +560,30 @@ if {$export_design} { } if {$sim_design} { - puts "Adding simulation Verilog file..." - if {$sim_verilog_file != ""} { - if { [file exists "$base_dir/$sim_verilog_file"] } { - if { [llength [get_filesets sim_1]] == 0 } { - create_fileset -simset sim_1 - } - set_property SOURCE_SET sources_1 [get_filesets sim_1] - add_files -fileset sim_1 -norecurse -scan_for_includes "$base_dir/$sim_verilog_file" - update_compile_order -fileset sim_1 - puts "Simulation Verilog file added: $base_dir/$sim_verilog_file" - set_property top tb_design_1_wrapper [get_filesets sim_1] - set_property -name {xsim.simulate.runtime} -value {200000ns} -objects [get_filesets sim_1] - puts "##########################" - puts "# Launching simulation #" - puts "##########################" - launch_simulation - } else { - puts "Error: Simulation Verilog file not found: $base_dir/$sim_verilog_file" - } - } else { + if {$sim_verilog_file == ""} { puts "Error: sim_verilog_file not provided." exit 1 } + if {![file exists "$base_dir/$sim_verilog_file"]} { + puts "Error: Simulation file not found: $base_dir/$sim_verilog_file" + exit 1 + } + if {[llength [get_filesets sim_1]] == 0} { + create_fileset -simset sim_1 + } + set_property SOURCE_SET sources_1 [get_filesets sim_1] + add_files -fileset sim_1 -norecurse -scan_for_includes "$base_dir/$sim_verilog_file" + update_compile_order -fileset sim_1 + puts "Simulation Verilog file added: $base_dir/$sim_verilog_file" + set_property top tb_design_1_wrapper [get_filesets sim_1] + set_property -name {xsim.simulate.runtime} -value {800000ns} -objects [get_filesets sim_1] + puts "##########################" + puts "# Launching simulation #" + puts "##########################" + launch_simulation } + close_project diff --git a/hls4ml/utils/simulation_utils.py b/hls4ml/utils/simulation_utils.py index 289aa68e00..eee2192172 100644 --- a/hls4ml/utils/simulation_utils.py +++ b/hls4ml/utils/simulation_utils.py @@ -54,7 +54,7 @@ def parse_component_xml(component_xml_path): return inputs, outputs -def generate_verilog_testbench(nn_config, testbench_output_path): +def write_verilog_testbench(nn_config, testbench_output_path): """ Generate a Verilog testbench for a given neural network configuration. 
The testbench includes:
@@ -361,6 +361,57 @@ def generate_verilog_testbench(nn_config, testbench_output_path):
     f.write('endmodule\n')
 
 
+def float_to_fixed(float_value, fractional_bits=10, total_bits=16):
+    scaling_factor = 1 << fractional_bits
+    max_val = (1 << (total_bits - 1)) - 1
+    min_val = -(1 << (total_bits - 1))
+
+    float_value = float(float_value)  # Convert to Python float if it's a numpy type
+
+    fixed_value = int(np.round(float_value * scaling_factor))
+    fixed_value = max(min(fixed_value, max_val), min_val)
+
+    if fixed_value < 0:
+        fixed_value = fixed_value + (1 << total_bits)  # Two's complement
+
+    return fixed_value
+
+def write_testbench_input(floats, file_name, fractional_bits=10, total_bits=16):
+    """
+    Convert 1D or 2D arrays (or lists of floats) to fixed-point and write to file.
+
+    If 'floats' is 1D: writes a single line.
+    If 'floats' is 2D: flattens each row and writes one line per row.
+    """
+    with open(file_name, "w") as f:
+        if len(floats) > 0 and isinstance(floats[0], (list, np.ndarray)):
+            for row in floats:
+                row_array = np.array(row).ravel()  # flatten if necessary
+                fixed_line = [float_to_fixed(val, fractional_bits, total_bits) for val in row_array]
+                f.write(" ".join(map(str, fixed_line)) + "\n")
+        else:
+            flattened = np.array(floats).ravel()  # ensure it's a flat array of scalars
+            fixed_line = [float_to_fixed(val, fractional_bits, total_bits) for val in flattened]
+            f.write(" ".join(map(str, fixed_line)) + "\n")
+
+
+def prepare_zero_input(layer):
+    batch_size = layer['batch_size']
+    fifo_depth = layer['fifo_depth']
+    zero_input = np.zeros((fifo_depth, batch_size), dtype=np.int32)
+    return zero_input
+
+def prepare_testbench_input(data, fifo_depth, batch_size):
+    data_arr = np.array(data)
+    # Flatten the data and then reshape it
+    # Ensure that total elements = fifo_depth * batch_size
+    total_elements = fifo_depth * batch_size
+    if data_arr.size != total_elements:
+        raise ValueError(
+            f"Data size {data_arr.size} does not match fifo_depth * batch_size = {total_elements}"
+        )
+    data_reshaped = data_arr.reshape((fifo_depth, batch_size))
+    return data_reshaped
 
 def read_testbench_log(testbench_log_path):
     """
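As a quick illustration of the two's-complement encoding performed by float_to_fixed above, here is a minimal round-trip sketch; the fixed_to_float helper is illustrative and not part of the patch:

    # Assumes float_to_fixed as defined in the hunk above (Q6.10 defaults).
    def fixed_to_float(fixed_value, fractional_bits=10, total_bits=16):
        # Undo the two's-complement wrap, then rescale
        if fixed_value >= (1 << (total_bits - 1)):
            fixed_value -= 1 << total_bits
        return fixed_value / float(1 << fractional_bits)

    assert float_to_fixed(-0.5) == 65024   # -512 wraps to 2**16 - 512
    assert fixed_to_float(65024) == -0.5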
From 55db302ad573703e2369c24668ab592ae4ed7bab Mon Sep 17 00:00:00 2001
From: dimdano
Date: Thu, 19 Dec 2024 13:29:53 +0100
Subject: [PATCH 25/50] support for user-defined input in verilog testbench of stitched IP

---
 docs/img/logo_small.png                | Bin 0 -> 6433 bytes
 hls4ml/backends/vitis/vitis_backend.py |  39 +++++++++--------------
 hls4ml/model/graph.py                  |  15 ++++-----
 hls4ml/utils/simulation_utils.py       |  41 +++++++++++++++----------
 4 files changed, 48 insertions(+), 47 deletions(-)
 create mode 100644 docs/img/logo_small.png

diff --git a/docs/img/logo_small.png b/docs/img/logo_small.png
new file mode 100644
index 0000000000000000000000000000000000000000..a894cc86ac6b43baa1acfd51f62d7e49102106d1
GIT binary patch
literal 6433
[6433 bytes of base85-encoded PNG data elided; the vitis_backend.py and
graph.py hunks of this patch, and the opening of the simulation_utils.py
hunk, were lost along with it]

diff --git a/hls4ml/utils/simulation_utils.py b/hls4ml/utils/simulation_utils.py
-        if len(floats) > 0 and isinstance(floats[0], (list, np.ndarray)):
-            for row in floats:
+        if len(float_inputs) > 0 and isinstance(float_inputs[0], (list, np.ndarray)):
+            for row in float_inputs:
                 row_array = np.array(row).ravel()  # flatten if necessary
-                fixed_line = [float_to_fixed(val, fractional_bits, total_bits) for val in row_array]
+                fixed_line = [float_to_fixed(val, integer_bits, fractional_bits) for val in row_array]
                 f.write(" ".join(map(str, fixed_line)) + "\n")
         else:
-            flattened = np.array(floats).ravel()  # ensure it's a flat array of scalars
-            fixed_line = [float_to_fixed(val, fractional_bits, total_bits) for val in flattened]
+            flattened = np.array(float_inputs).ravel()  # ensure it's a flat array of scalars
+            fixed_line = [float_to_fixed(val, integer_bits, fractional_bits) for val in flattened]
             f.write(" ".join(map(str, fixed_line)) + "\n")
 
 
@@ -403,7 +413,6 @@ def prepare_zero_input(layer):
 
 def prepare_testbench_input(data, fifo_depth, batch_size):
     data_arr = np.array(data)
-    # Flatten the data and then reshape it
     # Ensure that total elements = fifo_depth * batch_size
     total_elements = fifo_depth * batch_size
     if data_arr.size != total_elements:
         raise ValueError(
@@ -430,16 +439,16 @@ def read_testbench_log(testbench_log_path):
 
         sim_dict = {
             'BestLatency': int(BestLatency),
             'WorstLatency': int(WorstLatency),
-            'outputs': {}
+            'BehavSimResults': []
         }
 
         grouped = output_df.groupby('output_name')
         for name, group in grouped:
             indices = group['index'].astype(int)
             values = group['value'].astype(float)
-            array = np.zeros(max(indices) + 1, dtype=float)
+            array = np.zeros(max(indices) + 1, dtype=np.float64)
             array[indices] = values
-            sim_dict['outputs'][name] = array
+            sim_dict['BehavSimResults'].append(array)
 
         return sim_dict
From 0af75e79c150379e2eaf5c9d85c25e7df176959d Mon Sep 17 00:00:00 2001
From: dimdano
Date: Thu, 19 Dec 2024 15:49:28 +0100
Subject: [PATCH 26/50] fix for multi input/output layers in graph splitting

---
 hls4ml/converters/keras_to_hls.py |  2 +-
 hls4ml/model/graph.py             | 15 ++++++++++-----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py
index c9dd749296..bdc68da563 100644
--- a/hls4ml/converters/keras_to_hls.py
+++ b/hls4ml/converters/keras_to_hls.py
@@ -331,7 +331,7 @@ def keras_to_hls(config, split_layer_names = None):
     if split_layer_names:
         if any(any(layer in name for layer in merge_layers) for name in split_layer_names):
             raise ValueError(f'Split layer must not be a merge layer')
-        hls_model = ModelGraph.make_multi_graph(config, layer_list, output_shapes, split_layer_names)
+        hls_model = ModelGraph.make_multi_graph(config, layer_list, input_layers, output_layers, output_shapes, split_layer_names)
         print('Multi-graph HLS model created.')
     else:
         hls_model = ModelGraph(config, layer_list, input_layers, output_layers)
diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py
index 634b733fa6..f990dece0a 100644
--- a/hls4ml/model/graph.py
+++ b/hls4ml/model/graph.py
@@ -905,7 +905,7 @@ def build(self, **kwargs):
         return self.config.backend.build(self, **kwargs)
 
     @classmethod
-    def make_multi_graph(cls, config, layer_list, output_shapes, split_layer_names):
+    def make_multi_graph(cls, config, layer_list, input_layers, output_layers, output_shapes, split_layer_names):
         """Splits the layer list at the specified layers and creates multiple ModelGraphs.
 
         Args:
@@ -929,12 +929,12 @@ def make_multi_graph(cls, config, layer_list, input_layers, output_layers, outpu
         # Split the layer_list into subgraphs
         split_indices = sorted([layer_names.index(name) for name in split_layer_names])
         indices = [0] + split_indices + [len(layer_list)]
-        subgraphs_layer_lists = []
+        subgraph_layer_lists = []
         for i in range(len(indices) - 1):
             start = indices[i]
             end = indices[i + 1]
             sub_layer_list = layer_list[start:end]
-            subgraphs_layer_lists.append(sub_layer_list)
+            subgraph_layer_lists.append(sub_layer_list)
 
         # Create ModelGraphs for each subgraph
         model_graphs = []
@@ -942,7 +942,7 @@ def make_multi_graph(cls, config, layer_list, input_layers, output_layers, outpu
         original_ProjectName = config['ProjectName']
         current_index = 0
         last_output_precision = None
-        for idx, sub_layer_list in enumerate(subgraphs_layer_lists):
+        for idx, sub_layer_list in enumerate(subgraph_layer_lists):
             # Create a shallow copy of the config for each subgraph
             sub_config = copy.copy(config)
 
@@ -989,7 +989,12 @@ def make_multi_graph(cls, config, layer_list, input_layers, output_layers, outpu
             else:
                 pass  # case of granularity='Model'
 
-            hls_model = ModelGraph(sub_config, sub_layer_list, None, None, initial_index=current_index)
+            graph_output_layers = output_layers if idx == len(subgraph_layer_lists) - 1 else None
+            graph_input_layers = input_layers if idx == 0 else None
+            hls_model = ModelGraph(sub_config, sub_layer_list,
+                                   graph_input_layers,
+                                   graph_output_layers,
+                                   initial_index=current_index)
 
             # After creating subgraph, get the precision from the last layer's output.
             if hls_model.graph:

From db956284210cde4ad758f190dc8620eb9252ad25 Mon Sep 17 00:00:00 2001
From: dimdano
Date: Fri, 20 Dec 2024 16:37:16 +0100
Subject: [PATCH 27/50] documentation for MultiModelGraph flow

---
 docs/ir/multimodelgraph.rst | 128 ++++++++++++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)
 create mode 100644 docs/ir/multimodelgraph.rst

diff --git a/docs/ir/multimodelgraph.rst b/docs/ir/multimodelgraph.rst
new file mode 100644
index 0000000000..b92cc28035
--- /dev/null
+++ b/docs/ir/multimodelgraph.rst
@@ -0,0 +1,128 @@
+=======================
+MultiModelGraph Class
+=======================
+
+This page documents the ``MultiModelGraph`` class, which enables handling multiple subgraphs (each represented as a ``ModelGraph``) derived from a single original model.
+The central concept here is the division of a larger model into multiple smaller subgraphs at given layers, which can be useful for:
+
+* Very large models
+* Step-wise optimization
+* Modular design flows
+
+A ``MultiModelGraph`` manages these subgraphs, facilitating:
+
+* Parallel building and synthesis
+* Stitched designs (merging the subgraphs in HW after synthesis)
+* Simulation and performance estimation of the stitched design
+
+--------------
+Keras Example
+--------------
+
+For example, when converting a Keras model, you can specify the layers at which to split the model directly:
+
+.. code-block:: python
+
+   config = hls4ml.utils.config_from_keras_model(model, granularity='model')
+
+   hls_model = hls4ml.converters.convert_from_keras_model(
+       model,
+       hls_config=config,
+       backend='vitis',
+       split_layer_names = ['layer3', 'layer7']
+   )
+
+Here, the ``hls_model`` is actually a ``MultiModelGraph`` containing three subgraphs. Each subgraph is a ``ModelGraph`` accessible via indexing: ``hls_model[i]``.
+
+
+----------------------------------
+Key Methods for MultiModelGraph
+----------------------------------
+
+* :ref:`compile <mmg-compile-method>`
+* :ref:`predict <mmg-predict-method>`
+* :ref:`build <mmg-build-method>`
+* :ref:`trace <mmg-trace-method>`
+* :ref:`make_multi_graph <make_multi_graph-method>`
+
+----
+
+.. _make_multi_graph-method:
+
+``make_multi_graph`` method
+===========================
+
+The ``make_multi_graph`` method of ``ModelGraph`` takes a configuration, a full list of layers, the model's input and output layers, the output shapes, and a list of split-layer names. It returns a ``MultiModelGraph`` that contains multiple ``ModelGraph`` instances.
+
+.. code-block:: python
+
+   from hls4ml.model.graph import ModelGraph
+   multi_graph = ModelGraph.make_multi_graph(config, layer_list, input_layers, output_layers, output_shapes, split_layer_names=['fc2', 'fc3'])
+
+This allows modular design flows and easier debugging of large models.
+
+----
+
+.. _mmg-compile-method:
+
+``compile`` method
+==================
+
+Compiles all the individual ``ModelGraph`` subgraphs within the ``MultiModelGraph``.
+
+.. code-block:: python
+
+   multi_graph.compile()
+
+----
+
+.. _mmg-build-method:
+
+``build`` method
+================
+
+Builds all subgraphs in parallel, each as if they were standalone ``ModelGraph`` projects. Returns reports for each subgraph. If configured, it then runs the stitching flow in Vivado, connecting the individual exported IPs and allowing you to simulate the stitched design at the RTL level.
+
+.. code-block:: python
+
+   report = multi_graph.build(export=True, stitch_design=True)
+
+The returned ``report`` contains data from each subgraph's build and, if stitching was performed, a combined report of the stitched design.
+
+
+----
+
+.. _mmg-predict-method:
+
+``predict`` method
+==================
+
+Performs a forward pass through the chained sub-models using the C-simulation (``sim='csim'``). Data is automatically passed from one subgraph's output to the next subgraph's input. For large stitched designs, you can also leverage RTL simulation (``sim='rtl'``) to perform the forward pass at the register-transfer level. In this case, a Verilog testbench is dynamically generated and executed against the stitched IP design, providing behavioral simulation to accurately verify latency and output at the hardware level.
+
+.. code-block:: python
+
+   # Perform prediction using C-simulation (default)
+   y_csim = hls_model.predict(X, sim='csim')
+
+   # Perform prediction using RTL simulation (behavioral)
+   y_rtl = hls_model.predict(X, sim='rtl')
+
+
+.. 
_mmg-trace-method: + +``trace`` method +================ + +Provides detailed layer-by-layer outputs across all sub-models, which is essential for debugging or tuning quantization and precision settings. + +.. code-block:: python + + final_output, trace_outputs = hls_model.trace(X) + +``trace_outputs`` includes intermediate results from each subgraph, enabling insights into the data flow. + +-------------------------- +Summary +-------------------------- + +The ``MultiModelGraph`` class is a powerful tool for modular hardware design. By splitting a large neural network into multiple subgraphs, building each independently, and then stitching them together, you gain flexibility, parallelism, and a clear path to advanced workflows such as hierarchical design, incremental optimization, and integrated system-level simulations. From 738d4895f209700c3c65b5a989955ca12f6d81ba Mon Sep 17 00:00:00 2001 From: dimdano Date: Wed, 8 Jan 2025 17:45:09 +0100 Subject: [PATCH 28/50] faster rtl simulation --- hls4ml/templates/vivado/ip_stitcher.tcl | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/hls4ml/templates/vivado/ip_stitcher.tcl b/hls4ml/templates/vivado/ip_stitcher.tcl index 7c42c90036..5090608c35 100644 --- a/hls4ml/templates/vivado/ip_stitcher.tcl +++ b/hls4ml/templates/vivado/ip_stitcher.tcl @@ -576,11 +576,21 @@ if {$sim_design} { update_compile_order -fileset sim_1 puts "Simulation Verilog file added: $base_dir/$sim_verilog_file" set_property top tb_design_1_wrapper [get_filesets sim_1] - set_property -name {xsim.simulate.runtime} -value {800000ns} -objects [get_filesets sim_1] + set_property -name {xsim.simulate.runtime} -value {1000000ns} -objects [get_filesets sim_1] + + # Check if snapshot already exists + set snapshot_name "tb_design_1_wrapper_behav" + set xsim_folder_path "${base_dir}/${stitch_project_name}/vivado_stitched_design.sim/sim_1/behav/xsim" puts "##########################" - puts "# Launching simulation #" + puts "# Running Simulation... #" puts "##########################" - launch_simulation + if {[file exists "${xsim_folder_path}/${snapshot_name}.wdb"]} { + puts "Using existing snapshot..." 
+ cd $xsim_folder_path + exec xsim $snapshot_name -R + } else { + launch_simulation + } } From 7829e41d132e3ec2e5494fd2ef5c4222a499d297 Mon Sep 17 00:00:00 2001 From: dimdano Date: Fri, 10 Jan 2025 17:10:04 +0100 Subject: [PATCH 29/50] unwrap list if it has single element --- hls4ml/utils/simulation_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hls4ml/utils/simulation_utils.py b/hls4ml/utils/simulation_utils.py index 0279614b17..162c06f9e8 100644 --- a/hls4ml/utils/simulation_utils.py +++ b/hls4ml/utils/simulation_utils.py @@ -450,6 +450,9 @@ def read_testbench_log(testbench_log_path): array[indices] = values sim_dict['BehavSimResults'].append(array) + if len(sim_dict['BehavSimResults']) == 1: + sim_dict['BehavSimResults'] = sim_dict['BehavSimResults'][0] + return sim_dict except (KeyError, IndexError) as e: From f9fd4c0abec3d0380db07d67b441a5e2f879a50f Mon Sep 17 00:00:00 2001 From: dimdano Date: Wed, 15 Jan 2025 11:02:49 +0100 Subject: [PATCH 30/50] Make MultiModelGraph adaptable to user-defined names --- hls4ml/backends/vitis/vitis_backend.py | 25 ++++++++------- hls4ml/model/graph.py | 41 +++++++++++++++++++------ hls4ml/templates/vivado/ip_stitcher.tcl | 40 ++++++++++++------------ 3 files changed, 66 insertions(+), 40 deletions(-) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index 87af68874b..8e9f3a1490 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -134,20 +134,22 @@ def build( def build_stitched_design( self, - output_dir, - project_name, stitch_design=True, sim_stitched_design=False, export_stitched_design=False, nn_config=None, graph_reports=None, - simulation_input_data=None): + simulation_input_data=None): - os.makedirs(output_dir, exist_ok=True) - stitched_design_dir = os.path.join(output_dir, project_name) + OutputDir = nn_config['OutputDir'] + VivadoProjectName = nn_config['VivadoProjectName'] + OriginalProjectName = nn_config['OriginalProjectName'] + + os.makedirs(OutputDir, exist_ok=True) + stitched_design_dir = os.path.join(OutputDir, VivadoProjectName) if stitch_design: if os.path.exists(stitched_design_dir): - raise FileExistsError(f"The directory '{stitched_design_dir}' already exists.") + print(f"WARNING: The directory '{stitched_design_dir}' already exists.") os.makedirs(stitched_design_dir) spec = importlib.util.find_spec('hls4ml') @@ -162,7 +164,7 @@ def build_stitched_design( try: shutil.copy(ip_stitcher_path, stitched_design_dir) except Exception as e: - print(f"Error: {e}. Cannot copy 'ip_stitcher.tcl' to {project_name} folder.") + print(f"Error: {e}. Cannot copy 'ip_stitcher.tcl' to {VivadoProjectName} folder.") if nn_config: with open(nn_config_path, "w") as file: @@ -192,14 +194,15 @@ def build_stitched_design( f'stitch_design={int(stitch_design)}', f'sim_design={int(sim_stitched_design)}', f'export_design={int(export_stitched_design)}', - f'stitch_project_name={project_name}', - f'sim_verilog_file={os.path.join(project_name, "testbench.v")}' + f'stitch_project_name={VivadoProjectName}', + f'original_project_name={OriginalProjectName}', + f'sim_verilog_file=testbench.v' ] with open(stdout_log, 'w') as stdout_file, open(stderr_log, 'w') as stderr_file: process = subprocess.Popen( stitch_command, - cwd=output_dir, + cwd=stitched_design_dir, stdout=stdout_file, stderr=stderr_file, text=True, @@ -207,7 +210,7 @@ def build_stitched_design( ) process.communicate() if process.returncode != 0: - raise Exception(f'Stitching failed for {project_name}. 
See logs for details.') + raise Exception(f'Stitching failed for {VivadoProjectName}. See logs for details.') stitched_report = {'StitchedDesignReport': {}} if stitch_design: diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index f990dece0a..4bc1306b1b 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -946,8 +946,8 @@ def make_multi_graph(cls, config, layer_list, input_layers, output_layers, outpu # Create a shallow copy of the config for each subgraph sub_config = copy.copy(config) - sub_config['OutputDir'] = f"{original_OutputDir}_graph{idx + 1}" - sub_config['ProjectName'] = f"{original_ProjectName}_graph{idx + 1}" + sub_config['OutputDir'] = os.path.join(original_OutputDir, f'graph{idx + 1}') + sub_config['ProjectName'] = f'{original_ProjectName}_graph{idx + 1}' # For subgraphs after the first one, configure new input layer if idx > 0: @@ -1022,16 +1022,40 @@ def make_multi_graph(cls, config, layer_list, input_layers, output_layers, outpu class MultiModelGraph: def __init__(self, graphs): self.graphs = graphs - self.project_name = 'vivado_stitched_design' - self.output_dir = graphs[0].config.get_output_dir().split('/')[0] - self.backend = self.graphs[0].config.backend + self.config = copy.copy(self.graphs[0].config) + self._deepcopy_config_names(self.graphs[0].config.config) + self._initialize_config(graphs[0]) + self.config.config['VivadoProjectName'] = 'vivado_stitched_design' + self.backend = graphs[0].config.backend self.graph_reports = None - + self._top_function_lib = None + self._compile = ModelGraph._compile.__get__(self, MultiModelGraph) + + def _initialize_config(self, first_graph): + """ + Initialize the configuration using details from the first graph + """ + original_project_name = first_graph.config.get_project_name().partition('_graph')[0] + self.config.config['ProjectName'] = f"{original_project_name}_stitched" + self.config.config['OriginalProjectName'] = original_project_name + original_output_dir = first_graph.config.get_output_dir().partition('/graph')[0] + self.config.config['OutputDir'] = os.path.join(original_output_dir, 'stitched') + + def _deepcopy_config_names(self, config): + # Deep copy only 'ProjectName' and 'OutputDir', shallow copy others + keys_to_deepcopy = ['ProjectName', 'OutputDir'] + self.config.config = {k: copy.deepcopy(config[k]) + if k in keys_to_deepcopy + else config[k] for k in config} + def __getitem__(self, index): return self.graphs[index] def parse_nn_config(self): nn_config = {"inputs": [], "outputs": []} + nn_config['OutputDir'] = self.config.config['OutputDir'] + nn_config['VivadoProjectName'] = self.config.config['VivadoProjectName'] + nn_config['OriginalProjectName'] = self.config.config['OriginalProjectName'] # Parse layers (inputs and outputs) for graph, io_type in [(self.graphs[0], "inputs"), (self.graphs[-1], "outputs")]: @@ -1117,8 +1141,6 @@ def build_wrapper(g, **kwargs): if stitch_design or sim_stitched_design or export_stitched_design: nn_config = self.parse_nn_config() stitched_report = self.backend.build_stitched_design( - output_dir=self.output_dir, - project_name=self.project_name, stitch_design=stitch_design, sim_stitched_design=sim_stitched_design, export_stitched_design=export_stitched_design, @@ -1131,6 +1153,7 @@ def build_wrapper(g, **kwargs): def compile(self): for g in self.graphs: g.compile() + #self._compile() def predict(self, x, sim = 'csim'): if sim == 'csim': @@ -1142,8 +1165,6 @@ def predict(self, x, sim = 'csim'): elif sim == 'rtl': nn_config = self.parse_nn_config() 
stitched_report = self.backend.build_stitched_design( - output_dir=self.output_dir, - project_name=self.project_name, stitch_design=False, sim_stitched_design=True, export_stitched_design=False, diff --git a/hls4ml/templates/vivado/ip_stitcher.tcl b/hls4ml/templates/vivado/ip_stitcher.tcl index 5090608c35..18bdccdacd 100644 --- a/hls4ml/templates/vivado/ip_stitcher.tcl +++ b/hls4ml/templates/vivado/ip_stitcher.tcl @@ -14,6 +14,7 @@ array set opt { sim_design 0 export_design 0 stitch_project_name "" + original_project_name "" sim_verilog_file "" } @@ -34,18 +35,21 @@ set sim_design [expr {$opt(sim_design)}] set export_design [expr {$opt(export_design)}] set sim_verilog_file $opt(sim_verilog_file) set stitch_project_name $opt(stitch_project_name) +set original_project_name $opt(original_project_name) # Project base dir set base_dir [pwd] +set original_project_path "$base_dir/../../" +puts $base_dir # Name of the block design set bd_name "stitched_design" -# Find a directory that ends with "graph1", "graph2", etc. -set project_dirs [glob -nocomplain -directory $base_dir *graph[0-9]] +# Find a directory that ends with "graph1", "graph2", etc. in the parent project folder +set project_dirs [glob -nocomplain -directory $original_project_path *graph[0-9]] # Check if a matching directory is found if {[llength $project_dirs] == 0} { - puts "Error: No project directory ending with 'graph{id}' found in $base_dir" + puts "Error: No project directory ending with 'graph{id}' found in $original_project_path" } else { # Get the first matching directory set project_dir [lindex $project_dirs 0] @@ -62,7 +66,7 @@ if {[llength $project_dirs] == 0} { } # Procedure for stitching the project -proc stitch_procedure {base_dir stitch_project_name bd_name part} { +proc stitch_procedure {base_dir stitch_project_name original_project_name bd_name part} { puts "###########################################################" puts "# Starting the IP connection process... " @@ -70,16 +74,14 @@ proc stitch_procedure {base_dir stitch_project_name bd_name part} { # Create New Vivado Project - file mkdir $stitch_project_name - cd $stitch_project_name - create_project $stitch_project_name . -part $part + create_project $stitch_project_name . 
-part $part -force
 
     # Add repositories
     # Initialize the repo count
     set repo_count 0
     # Loop through potential project directories
-    for {set i 1} {[file exists "$base_dir/hls4ml_prj_graph$i/myproject_graph${i}_prj"]} {incr i} {
-        set repo_path "$base_dir/hls4ml_prj_graph$i/myproject_graph${i}_prj/solution1/impl/ip"
+    for {set i 1} {[file exists "$base_dir/graph$i/${original_project_name}_graph${i}_prj"]} {incr i} {
+        set repo_path "$base_dir/graph$i/${original_project_name}_graph${i}_prj/solution1/impl/ip"
         # Check if the repository path exists
         if {[file isdirectory $repo_path]} {
             # Add repository path to current project's IP repository paths
@@ -106,14 +108,14 @@ proc stitch_procedure {base_dir stitch_project_name original_project_name bd_nam
 
     # Add IPs to block design
     for {set i 1} {$i <= $repo_count} {incr i} {
-        set vlnv "xilinx.com:hls:myproject_graph$i:1.0"
-        create_bd_cell -type ip -vlnv $vlnv "myproject_graph${i}_0"
+        set vlnv "xilinx.com:hls:${original_project_name}_graph${i}:1.0"
+        create_bd_cell -type ip -vlnv $vlnv "${original_project_name}_graph${i}_0"
     }
 
     # Collect all IP instance names in a list
     set ip_instances {}
     for {set i 1} {$i <= $repo_count} {incr i} {
-        set ip_name "myproject_graph${i}_0"
+        set ip_name "${original_project_name}_graph${i}_0"
         lappend ip_instances $ip_name
     }
 
@@ -532,14 +534,14 @@ proc stitch_procedure {base_dir stitch_project_name original_project_name bd_nam
 }
 
 if {$stitch_design} {
-    stitch_procedure $base_dir $stitch_project_name $bd_name $part
+    stitch_procedure $original_project_path $stitch_project_name $original_project_name $bd_name $part
 } else {
-    set existing_stitch_project_name [file join $stitch_project_name "$stitch_project_name.xpr"]
-    if {[file exists $existing_stitch_project_name]} {
-        puts "Opening existing project: $existing_stitch_project_name"
-        open_project $existing_stitch_project_name
+    #set existing_stitch_project_name [file join $stitch_project_name "$stitch_project_name.xpr"]
+    if {[file exists "$stitch_project_name.xpr"]} {
+        puts "Opening existing project: $stitch_project_name.xpr"
+        open_project "$stitch_project_name.xpr"
     } else {
-        puts "Error: Project file '$existing_stitch_project_name' does not exist."
+        puts "Error: Project file '$stitch_project_name.xpr' does not exist."
         exit 1
     }
 }
@@ -580,7 +582,7 @@ if {$sim_design} {
 
     # Check if snapshot already exists
     set snapshot_name "tb_design_1_wrapper_behav"
-    set xsim_folder_path "${base_dir}/${stitch_project_name}/vivado_stitched_design.sim/sim_1/behav/xsim"
+    set xsim_folder_path "${base_dir}/vivado_stitched_design.sim/sim_1/behav/xsim"
     puts "##########################"
     puts "# Running Simulation...  #"
     puts "##########################"
     if {[file exists "${xsim_folder_path}/${snapshot_name}.wdb"]} {
         puts "Using existing snapshot..."
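For orientation, a minimal sketch of the naming fields that build_stitched_design reads from nn_config after this patch; the values are illustrative, not taken from a real run:

    # inputs/outputs omitted; see parse_nn_config above for their layout
    nn_config = {
        'OutputDir': 'hls4ml_prj/stitched',             # derived from the first graph
        'VivadoProjectName': 'vivado_stitched_design',  # fixed by MultiModelGraph
        'OriginalProjectName': 'myproject',             # ProjectName minus '_graphN'
        'inputs': [],
        'outputs': [],
    }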
#" puts "##########################" From 05ea6c9c9bee92ec282a79651ccd110a718a1210 Mon Sep 17 00:00:00 2001 From: dimdano Date: Wed, 15 Jan 2025 15:31:48 +0100 Subject: [PATCH 31/50] stitch script time verbose --- hls4ml/templates/vivado/ip_stitcher.tcl | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/hls4ml/templates/vivado/ip_stitcher.tcl b/hls4ml/templates/vivado/ip_stitcher.tcl index 18bdccdacd..9b6efe1b2c 100644 --- a/hls4ml/templates/vivado/ip_stitcher.tcl +++ b/hls4ml/templates/vivado/ip_stitcher.tcl @@ -534,7 +534,13 @@ proc stitch_procedure {base_dir stitch_project_name original_project_name bd_nam } if {$stitch_design} { + set start_time [clock seconds] stitch_procedure $original_project_path $stitch_project_name $original_project_name $bd_name $part + set end_time [clock seconds] + set elapsed_time [expr {$end_time - $start_time}] + puts "=====================================================" + puts "\[Stitch\] Elapsed Time : $elapsed_time seconds" + puts "=====================================================" } else { #set existing_stitch_project_name [file join $stitch_project_name "$stitch_project_name.xpr"] if {[file exists "$stitch_project_name.xpr"]} { @@ -547,6 +553,7 @@ if {$stitch_design} { } if {$export_design} { + set start_time [clock seconds] puts "Exporting stitched IP..." set stitched_ip_dir "ip_repo" ipx::package_project -root_dir $stitched_ip_dir \ @@ -559,9 +566,13 @@ if {$export_design} { ipx::check_integrity [ipx::find_open_core user.org:user:stitched_design:1.0] ipx::save_core [ipx::find_open_core user.org:user:stitched_design:1.0] puts "Stitched IP has been exported to '$stitched_ip_dir' folder" + puts "=====================================================" + puts "\[Export\] Elapsed Time : $elapsed_time seconds" + puts "=====================================================" } if {$sim_design} { + set start_time [clock seconds] if {$sim_verilog_file == ""} { puts "Error: sim_verilog_file not provided." 
exit 1 @@ -593,6 +604,11 @@ if {$sim_design} { } else { launch_simulation } + set end_time [clock seconds] + set elapsed_time [expr {$end_time - $start_time}] + puts "=====================================================" + puts "\[Simulation\] Elapsed Time : $elapsed_time seconds" + puts "=====================================================" } From 193381d7c828c039d8bae36d1130e81a324925e1 Mon Sep 17 00:00:00 2001 From: dimdano Date: Wed, 15 Jan 2025 15:48:14 +0100 Subject: [PATCH 32/50] fix with existing stitch project folder --- hls4ml/backends/vitis/vitis_backend.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index 8e9f3a1490..9935ee011d 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -148,9 +148,8 @@ def build_stitched_design( os.makedirs(OutputDir, exist_ok=True) stitched_design_dir = os.path.join(OutputDir, VivadoProjectName) if stitch_design: - if os.path.exists(stitched_design_dir): - print(f"WARNING: The directory '{stitched_design_dir}' already exists.") - os.makedirs(stitched_design_dir) + if not os.path.exists(stitched_design_dir): + os.makedirs(stitched_design_dir) spec = importlib.util.find_spec('hls4ml') hls4ml_path = os.path.dirname(spec.origin) From 04ac0f438d5b9143955e023f2d09c9311673361e Mon Sep 17 00:00:00 2001 From: dimdano Date: Thu, 16 Jan 2025 17:09:13 +0100 Subject: [PATCH 33/50] initial support for multigraph compilation in bridge file --- hls4ml/backends/vitis/vitis_backend.py | 20 +- hls4ml/model/graph.py | 194 ++++++++++++++++-- .../templates/vivado/build_lib_multigraph.sh | 41 ++++ .../vivado/myproject_bridge_multigraph.cpp | 70 +++++++ 4 files changed, 296 insertions(+), 29 deletions(-) create mode 100644 hls4ml/templates/vivado/build_lib_multigraph.sh create mode 100644 hls4ml/templates/vivado/myproject_bridge_multigraph.cpp diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index 9935ee011d..c66079a734 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -141,12 +141,8 @@ def build_stitched_design( graph_reports=None, simulation_input_data=None): - OutputDir = nn_config['OutputDir'] - VivadoProjectName = nn_config['VivadoProjectName'] - OriginalProjectName = nn_config['OriginalProjectName'] - - os.makedirs(OutputDir, exist_ok=True) - stitched_design_dir = os.path.join(OutputDir, VivadoProjectName) + os.makedirs(nn_config['OutputDir'], exist_ok=True) + stitched_design_dir = os.path.join(nn_config['OutputDir'], nn_config['StitchedProjectName']) if stitch_design: if not os.path.exists(stitched_design_dir): os.makedirs(stitched_design_dir) @@ -163,7 +159,7 @@ def build_stitched_design( try: shutil.copy(ip_stitcher_path, stitched_design_dir) except Exception as e: - print(f"Error: {e}. Cannot copy 'ip_stitcher.tcl' to {VivadoProjectName} folder.") + print(f"Error: {e}. 
Cannot copy 'ip_stitcher.tcl' to {nn_config['StitchedProjectName']} folder.") if nn_config: with open(nn_config_path, "w") as file: @@ -183,7 +179,7 @@ def build_stitched_design( data = simulation_input_data[i] input_data_reshaped = prepare_testbench_input(data, layer['fifo_depth'], layer['batch_size']) write_testbench_input(input_data_reshaped, testbench_input_path, layer['integer_bits'], layer['fractional_bits']) - print('Verilog testbench and its input data was generated.') + print('Verilog testbench and its input data were generated.') print('Running build process of stitched IP...\n') stitch_command = [ @@ -193,8 +189,8 @@ def build_stitched_design( f'stitch_design={int(stitch_design)}', f'sim_design={int(sim_stitched_design)}', f'export_design={int(export_stitched_design)}', - f'stitch_project_name={VivadoProjectName}', - f'original_project_name={OriginalProjectName}', + f"stitch_project_name={nn_config['StitchedProjectName']}", + f"original_project_name={nn_config['OriginalProjectName']}", f'sim_verilog_file=testbench.v' ] @@ -209,8 +205,8 @@ def build_stitched_design( ) process.communicate() if process.returncode != 0: - raise Exception(f'Stitching failed for {VivadoProjectName}. See logs for details.') - + raise Exception(f"Stitching failed for {nn_config['StitchedProjectName']}. See logs for details.") + stitched_report = {'StitchedDesignReport': {}} if stitch_design: stitched_report = aggregate_graph_reports(graph_reports) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 4bc1306b1b..4d067fb8d8 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -6,6 +6,7 @@ import numpy as np import numpy.ctypeslib as npc import copy +import stat import importlib.util import shutil import re @@ -1025,10 +1026,13 @@ def __init__(self, graphs): self.config = copy.copy(self.graphs[0].config) self._deepcopy_config_names(self.graphs[0].config.config) self._initialize_config(graphs[0]) - self.config.config['VivadoProjectName'] = 'vivado_stitched_design' + self.config.config['StitchedProjectName'] = 'vivado_stitched_design' self.backend = graphs[0].config.backend self.graph_reports = None self._top_function_lib = None + self.config.config['Stamp'] = '64616e' + self.inputs = graphs[0].inputs + self.outputs = graphs[-1].outputs self._compile = ModelGraph._compile.__get__(self, MultiModelGraph) def _initialize_config(self, first_graph): @@ -1054,7 +1058,7 @@ def __getitem__(self, index): def parse_nn_config(self): nn_config = {"inputs": [], "outputs": []} nn_config['OutputDir'] = self.config.config['OutputDir'] - nn_config['VivadoProjectName'] = self.config.config['VivadoProjectName'] + nn_config['StitchedProjectName'] = self.config.config['StitchedProjectName'] nn_config['OriginalProjectName'] = self.config.config['OriginalProjectName'] # Parse layers (inputs and outputs) @@ -1064,14 +1068,7 @@ def parse_nn_config(self): total_bits = 1 [total_bits := total_bits * num for num in graph.output_vars[layer].shape] pragma = graph.output_vars[layer].pragma - if isinstance(pragma, str): - layer_pragma = pragma # 'reshape' or 'partition' pragma - fifo_depth = 1 - elif isinstance(pragma, (list, tuple)) and len(pragma) == 2: - layer_pragma = pragma[0] # 'stream' pragma - fifo_depth = pragma[1] - else: - raise ValueError(f"Unexpected format for pragma: {pragma}") + layer_pragma, fifo_depth = self._get_pragma_details(pragma) if total_bits % fifo_depth != 0: raise ValueError(f"Division of total_bits by fifo_depth does not result in a remainder of zero.") batch_size = total_bits // 
fifo_depth @@ -1139,6 +1136,7 @@ def build_wrapper(g, **kwargs): self._replace_logos() if stitch_design or sim_stitched_design or export_stitched_design: + self._assert_consistent_pragmas() nn_config = self.parse_nn_config() stitched_report = self.backend.build_stitched_design( stitch_design=stitch_design, @@ -1153,7 +1151,9 @@ def build_wrapper(g, **kwargs): def compile(self): for g in self.graphs: g.compile() - #self._compile() + self.write_build_script() + self.write_bridge() + self._compile() def predict(self, x, sim = 'csim'): if sim == 'csim': @@ -1184,7 +1184,139 @@ def trace(self, x): input_data = output_data trace_output.append(curr_trace_output) return output_data, trace_output + + def write_build_script(self): + # NOTE if we move this function to Vivado writer we need to pass graph objects + spec = importlib.util.find_spec('hls4ml') + hls4ml_path = os.path.dirname(spec.origin) + build_lib_src = os.path.join(hls4ml_path, 'templates/vivado/build_lib_multigraph.sh') + os.makedirs(self.config.config['OutputDir'], exist_ok=True) + build_lib_dst = os.path.join(self.config.config['OutputDir'], 'build_lib.sh') + graph_project_names = ' '.join(f"\"{g.config.get_output_dir().split('/')[-1]}\"" for g in self.graphs) + with open(build_lib_src) as src, open(build_lib_dst, 'w') as dst: + for line in src.readlines(): + line = line.replace('myproject', self.config.config['OriginalProjectName']) + line = line.replace('myproject_stitched', self.config.config['ProjectName']) + line = line.replace('mystamp', self.config.config['Stamp']) + line = line.replace('mygraph_name_list', graph_project_names) + dst.write(line) + os.chmod(build_lib_dst, os.stat(build_lib_dst).st_mode | stat.S_IEXEC) + + def write_bridge(self): + # NOTE if we move this function to Vivado writer we need to pass graph objects + """Write the Python-C++ bridge (myproject_bridge.cpp) + Args: + model (ModelGraph): the hls4ml model. 
+ """ + + filedir = os.path.dirname(os.path.abspath(__file__)) + f = open(os.path.join(filedir, '../templates/vivado/myproject_bridge_multigraph.cpp')) + fout = open(f"{self.config.get_output_dir()}/{self.config.config['ProjectName']}_bridge.cpp", 'w') + model_inputs = self.graphs[0].get_input_variables() + model_outputs = self.graphs[-1].get_output_variables() + model_brams = [var for var in self.graphs[0].get_weight_variables() if var.storage.lower() == 'bram'] + + indent = ' ' + + for line in f.readlines(): + newline = '' + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', format(self.config.config['ProjectName'].upper())) + elif 'firmware/myproject' in line: + for graph_idx in range(len(self.graphs)): + newline += line.replace('myproject', format(self.graphs[graph_idx].config.config['ProjectName'])) + newline += '\n#undef DEFINES_H_\n' if graph_idx < len(self.graphs)-1 else '' + elif 'myproject' in line: + newline = line.replace('myproject', format(self.graphs[0].config.config['ProjectName'])) + + elif '// hls-fpga-machine-learning insert bram' in line: + newline = line + for bram in model_brams: + newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + + elif '// hls-fpga-machine-learning insert header' in line: + dtype = line.split('#', 1)[1].strip() + inputs_str = ', '.join([f'{dtype} {i.name}[{i.size_cpp()}]' for i in model_inputs]) + outputs_str = ', '.join([f'{dtype} {o.name}[{o.size_cpp()}]' for o in model_outputs]) + + newline = '' + newline += indent + inputs_str + ',\n' + newline += indent + outputs_str + '\n' + + elif '// hls-fpga-machine-learning insert wrapper' in line: + dtype = line.split('#', 1)[1].strip() + newline = '' + for i in model_inputs: + newline += indent + '{var};\n'.format(var=i.definition_cpp(name_suffix='_ap')) + newline += indent + 'nnet::convert_data<{}, {}, {}>({}, {}_ap);\n'.format( + dtype, i.type.name, i.size_cpp(), i.name, i.name + ) + newline += '\n' + + for o in model_outputs: + newline += indent + '{var};\n'.format(var=o.definition_cpp(name_suffix='_ap')) + + newline += '\n' + + input_vars = ','.join([i.name + '_ap' for i in model_inputs]) + bram_vars = ','.join([b.name for b in model_brams]) + output_vars = ','.join([o.name + '_ap' for o in model_outputs]) + + # Concatenate the input, output, and bram variables. 
Filter out empty/null values
+                all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars]))
+
+                top_level = indent + f"//{self.config.config['ProjectName']}({all_vars});\n"
+                newline += top_level
+
+                newline += '\n'
+
+                for o in model_outputs:
+                    newline += indent + 'nnet::convert_data<{}, {}, {}>({}_ap, {});\n'.format(
+                        o.type.name, dtype, o.size_cpp(), o.name, o.name
+                    )
+
+            elif '// hls-fpga-machine-learning insert trace_outputs' in line:
+                newline = ''
+                for layer in self.graphs[0].get_layers():
+                    func = layer.get_attr('function_cpp', None)
+                    if func and self.graphs[0].config.trace_output and layer.get_attr('trace', False):
+                        vars = layer.get_variables()
+                        for var in vars:
+                            newline += (
+                                indent
+                                + 'nnet::trace_outputs->insert(std::pair<std::string, void *>('
+                                + f'"{layer.name}", (void *) malloc({var.size_cpp()} * element_size)));\n'
+                            )
+
+            elif '// hls-fpga-machine-learning insert namespace' in line:
+                newline = ''
+
+                namespace = self.config.get_writer_config().get('Namespace', None)
+                if namespace is not None:
+                    newline += indent + f'using namespace {namespace};\n'
+
+            else:
+                newline = line
+            fout.write(newline)
+
+        f.close()
+        fout.close()
+
+    def _get_pragma_details(self, pragma):
+        """
+        Extracts the pragma type and FIFO depth from the given pragma.
+        """
+        if isinstance(pragma, str):
+            pragma_str = pragma  # 'reshape' or 'partition' pragma
+            fifo_depth = 1
+        elif isinstance(pragma, (list, tuple)) and len(pragma) == 2:
+            pragma_str = pragma[0]  # 'stream' pragma
+            fifo_depth = pragma[1]
+        else:
+            raise ValueError(f"Unexpected format for pragma: {pragma}")
+
+        return pragma_str, fifo_depth
+
     def _print_status(self, status):
         print('\r', end='')
         status_icons = {
@@ -1196,6 +1328,34 @@ def _print_status(self, status):
         status_str = ' | '.join(f'{proj}: {status_icons.get(stat, "?")}' for proj, stat in status.items())
         print(status_str, flush=True)
 
+    def _assert_consistent_pragmas(self):
+        """
+        Ensure all graphs have the same pragma in their input and output layers.
+        Stitching and simulating mixed pragmas is not supported at the moment.
+ """ + ref_pragmas = set( + self._get_pragma_details(self.graphs[0].output_vars[layer].pragma)[0] + for layer in self.graphs[0].inputs + self.graphs[0].outputs + if layer in self.graphs[0].output_vars + ) + + if len(ref_pragmas) != 1: + raise ValueError(f"Multiple pragmas detected in 1st graph: {ref_pragmas} ") + + for idx, g in enumerate(self.graphs[1:], start=1): + current_pragmas = set( + self._get_pragma_details(g.output_vars[layer].pragma)[0] + for layer in g.inputs + g.outputs + if layer in g.output_vars + ) + + if ref_pragmas != current_pragmas: + raise ValueError( + f"Pragma mismatch in graph {idx}:\n" + f"Expected: {ref_pragmas}\n" + f"Found: {current_pragmas}" + ) + def _replace_logos(self): spec = importlib.util.find_spec("hls4ml") hls4ml_path = os.path.dirname(spec.origin) @@ -1204,16 +1364,16 @@ def _replace_logos(self): if not os.path.isfile(hls4ml_logo): raise FileNotFoundError(f"hls4ml logo not found at: {hls4ml_logo}") - for graph in self.graphs: + for g in self.graphs: graph_logo_paths = [ os.path.join( - graph.config.get_output_dir(), - graph.config.get_project_name() + '_prj', + g.config.get_output_dir(), + g.config.get_project_name() + '_prj', 'solution1/impl/misc/logo.png' ), os.path.join( - graph.config.get_output_dir(), - graph.config.get_project_name() + '_prj', + g.config.get_output_dir(), + g.config.get_project_name() + '_prj', 'solution1/impl/ip/misc/logo.png' ) ] @@ -1221,4 +1381,4 @@ def _replace_logos(self): for logo in graph_logo_paths: shutil.copy(hls4ml_logo, logo) except Exception as e: - print(f"Error copying hls4ml logo to {graph.config.get_output_dir()} project: {e}") + print(f"Error copying hls4ml logo to {g.config.get_output_dir()} project: {e}") diff --git a/hls4ml/templates/vivado/build_lib_multigraph.sh b/hls4ml/templates/vivado/build_lib_multigraph.sh new file mode 100644 index 0000000000..4621a2d2f7 --- /dev/null +++ b/hls4ml/templates/vivado/build_lib_multigraph.sh @@ -0,0 +1,41 @@ +#!/bin/bash +set -e + +CC=g++ +if [[ "$OSTYPE" == "linux-gnu" ]]; then + CFLAGS="-O3 -fPIC -std=c++11 -fno-gnu-unique" +elif [[ "$OSTYPE" == "darwin"* ]]; then + CFLAGS="-O3 -fPIC -std=c++11" +fi + +graph_project_names=(mygraph_name_list) + +LDFLAGS= +ORIGINAL_PROJECT=myproject +PROJECT=myproject_stitched +LIB_STAMP=mystamp +BASEDIR="$(cd "$(dirname "$0")" && cd .. 
&& pwd)" +AP_TYPES_PATH="-I${BASEDIR}/${graph_project_names[0]}/firmware/ap_types/" +INCFLAGS="" +OUTPUT_DIR="${BASEDIR}/stitched/firmware" + +mkdir -p "${OUTPUT_DIR}" + +# Compile all graphs +OBJECT_FILES=() +for g in "${graph_project_names[@]}"; do + WEIGHTS_DIR="\"${BASEDIR}/${g}/firmware/weights\"" + SRC_FILE="${g}/firmware/${ORIGINAL_PROJECT}_${g}.cpp" + OBJ_FILE="${ORIGINAL_PROJECT}_${g}.o" + + ${CC} ${CFLAGS} ${AP_TYPES_PATH} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c "${BASEDIR}/${SRC_FILE}" -o "${OBJ_FILE}" + OBJECT_FILES+=("${OBJ_FILE}") + INCFLAGS+="-I${BASEDIR}/${g}/ " +done + +${CC} ${CFLAGS} ${INCFLAGS} ${AP_TYPES_PATH} -c "${PROJECT}_bridge.cpp" -o ${PROJECT}_bridge.o + +${CC} ${CFLAGS} ${INCFLAGS} ${AP_TYPES_PATH} -shared "${OBJECT_FILES[@]}" ${PROJECT}_bridge.o -o "${OUTPUT_DIR}/${PROJECT}-${LIB_STAMP}.so" + +rm -f "${OBJECT_FILES[@]}" +rm -f ${PROJECT}_bridge.o diff --git a/hls4ml/templates/vivado/myproject_bridge_multigraph.cpp b/hls4ml/templates/vivado/myproject_bridge_multigraph.cpp new file mode 100644 index 0000000000..edd75a4246 --- /dev/null +++ b/hls4ml/templates/vivado/myproject_bridge_multigraph.cpp @@ -0,0 +1,70 @@ +#ifndef MYPROJECT_BRIDGE_H_ +#define MYPROJECT_BRIDGE_H_ + +#include "firmware/myproject.h" + +#include "firmware/nnet_utils/nnet_helpers.h" +#include +#include + +// hls-fpga-machine-learning insert bram + +namespace nnet { +bool trace_enabled = false; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet + +extern "C" { + +struct trace_data { + const char *name; + void *data; +}; + +void allocate_trace_storage(size_t element_size) { + nnet::trace_enabled = true; + nnet::trace_outputs = new std::map; + nnet::trace_type_size = element_size; + // hls-fpga-machine-learning insert trace_outputs +} + +void free_trace_storage() { + for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { + void *ptr = i->second; + free(ptr); + } + nnet::trace_outputs->clear(); + delete nnet::trace_outputs; + nnet::trace_outputs = NULL; + nnet::trace_enabled = false; +} + +void collect_trace_output(struct trace_data *c_trace_outputs) { + int ii = 0; + for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { + c_trace_outputs[ii].name = i->first.c_str(); + c_trace_outputs[ii].data = i->second; + ii++; + } +} + +// Wrapper of top level function for Python bridge +void myproject_float( + // hls-fpga-machine-learning insert header #float +) { + // hls-fpga-machine-learning insert namespace + + // hls-fpga-machine-learning insert wrapper #float +} + +void myproject_double( + // hls-fpga-machine-learning insert header #double +) { + // hls-fpga-machine-learning insert namespace + + // hls-fpga-machine-learning insert wrapper #double +} +} + +#endif From 10e95a8b7cd7e70b744d06b8fb357a7369603e50 Mon Sep 17 00:00:00 2001 From: dimdano Date: Fri, 17 Jan 2025 14:24:36 +0100 Subject: [PATCH 34/50] stitched report fix for VivadoSynth aggregate --- hls4ml/report/vivado_report.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/hls4ml/report/vivado_report.py b/hls4ml/report/vivado_report.py index 43e783f753..ec4182dcb4 100644 --- a/hls4ml/report/vivado_report.py +++ b/hls4ml/report/vivado_report.py @@ -684,24 +684,29 @@ def aggregate_graph_reports(graph_reports): keys_to_sum = ['BRAM_18K', 'DSP', 'FF', 'LUT', 'URAM'] first_subgraph = next(iter(graph_reports)) - base_report = graph_reports[first_subgraph]['CSynthesisReport'] + 
reportChoice = (
+        'CSynthesisReport' if 'VivadoSynthReport' not in graph_reports[first_subgraph]
+        else 'VivadoSynthReport'
+    )
+    base_report = graph_reports[first_subgraph][reportChoice]
+    csynth_report = graph_reports[first_subgraph].get('CSynthesisReport', base_report)

     final_report = {
-        'TargetClockPeriod': base_report.get('TargetClockPeriod', 'N/A'),
-        'EstimatedClockPeriod': float(base_report.get('EstimatedClockPeriod', float('inf'))),
+        'TargetClockPeriod': csynth_report.get('TargetClockPeriod', 'N/A'),
+        'EstimatedClockPeriod': float(csynth_report.get('EstimatedClockPeriod', float('inf'))),
         'BestLatency': 'N/A',
         'WorstLatency': 'N/A'
     }

+    final_report['AvailableBRAM_18K'] = csynth_report.get('AvailableBRAM_18K', 'N/A')
+    final_report['AvailableDSP'] = csynth_report.get('AvailableDSP', 'N/A')
+    final_report['AvailableFF'] = csynth_report.get('AvailableFF', 'N/A')
+    final_report['AvailableLUT'] = csynth_report.get('AvailableLUT', 'N/A')
+    final_report['AvailableURAM'] = csynth_report.get('AvailableURAM', 'N/A')
+
     for k in keys_to_sum:
         final_report[k] = int(base_report.get(k, '0'))

-    final_report['AvailableBRAM_18K'] = base_report.get('AvailableBRAM_18K', 'N/A')
-    final_report['AvailableDSP'] = base_report.get('AvailableDSP', 'N/A')
-    final_report['AvailableFF'] = base_report.get('AvailableFF', 'N/A')
-    final_report['AvailableLUT'] = base_report.get('AvailableLUT', 'N/A')
-    final_report['AvailableURAM'] = base_report.get('AvailableURAM', 'N/A')
-
     for subgraph, data in graph_reports.items():
         if subgraph == first_subgraph:
             continue
@@ -712,6 +717,8 @@ def aggregate_graph_reports(graph_reports):

         for k in keys_to_sum:
             final_report[k] += int(report.get(k, '0'))
+            if k == 'DSP':
+                final_report[k] += int(report.get('DSP48E', '0'))

     final_report['EstimatedClockPeriod'] = f"{final_report['EstimatedClockPeriod']:.3f}"
     for k in keys_to_sum:

From 8c5a13b57efd3d319cc8b84dc5ac5995476661e4 Mon Sep 17 00:00:00 2001
From: dimdano
Date: Tue, 21 Jan 2025 17:26:59 +0100
Subject: [PATCH 35/50] use log_to_stdout flag for parallel builds

---
 docs/ir/multimodelgraph.rst            |  6 +++---
 hls4ml/backends/vitis/vitis_backend.py | 18 ++++++++-----
 hls4ml/model/graph.py                  | 37 +++++++++++++-------------
 3 files changed, 34 insertions(+), 27 deletions(-)

diff --git a/docs/ir/multimodelgraph.rst b/docs/ir/multimodelgraph.rst
index b92cc28035..d14cc4ec2c 100644
--- a/docs/ir/multimodelgraph.rst
+++ b/docs/ir/multimodelgraph.rst
@@ -110,7 +110,7 @@ Performs a forward pass through the chained sub-models using the C-simulation (`

 .. _mmg-trace-method:

-``trace`` method
-================
+``trace`` method [TODO]
+=======================

 Provides detailed layer-by-layer outputs across all sub-models, which is essential for debugging or tuning quantization and precision settings.
@@ -125,4 +125,4 @@ Provides detailed layer-by-layer outputs across all sub-models, which is essenti

 Summary
 --------------------------

-The ``MultiModelGraph`` class is a powerful tool for modular hardware design. By splitting a large neural network into multiple subgraphs, building each independently, and then stitching them together, you gain flexibility, parallelism, and a clear path to advanced workflows such as hierarchical design, incremental optimization, and integrated system-level simulations.
+The ``MultiModelGraph`` class is a tool for modular hardware design. 
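A minimal usage sketch of that flow (``hls_model`` is the ``MultiModelGraph`` from the conversion example at the top of this page; the flag combination shown is illustrative, not the only valid one):

.. code-block:: python

   # Sub-models are built in parallel worker threads; per-graph Vitis logs
   # are redirected to build_stdout.log / build_stderr.log in each graph
   # directory. Note that stitch_design requires export=True.
   report = hls_model.build(export=True, stitch_design=True)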
By splitting a large neural network into multiple subgraphs, building each independently, and then stitching them together, you gain flexibility and parallelism, and you enable hierarchical design, incremental optimization, and integrated system-level simulations.
diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py
index c66079a734..6ff8001e7e 100644
--- a/hls4ml/backends/vitis/vitis_backend.py
+++ b/hls4ml/backends/vitis/vitis_backend.py
@@ -99,6 +99,7 @@ def build(
         export=False,
         vsynth=False,
         fifo_opt=False,
+        log_to_stdout=True,
     ):
         if 'linux' in sys.platform:
             found = os.system('command -v vitis_hls > /dev/null')
@@ -111,24 +112,29 @@ def build(
         ).format(reset=reset, csim=csim, synth=synth, cosim=cosim, validation=validation, export=export, vsynth=vsynth, fifo_opt=fifo_opt)

         output_dir = model.config.get_output_dir()
-        # Define log file paths
-        # NOTE - 'build_stdout.log' is the same as 'vitis_hls.log'
         stdout_log = os.path.join(output_dir, 'build_stdout.log')
         stderr_log = os.path.join(output_dir, 'build_stderr.log')

-        with open(stdout_log, 'w') as stdout_file, open(stderr_log, 'w') as stderr_file:
-            # Use subprocess.Popen to capture output
+        stdout_target = None if log_to_stdout else open(stdout_log, 'w')
+        stderr_target = None if log_to_stdout else open(stderr_log, 'w')
+
+        try:
             process = subprocess.Popen(
                 build_command,
                 shell=True,
                 cwd=output_dir,
-                stdout=stdout_file,
-                stderr=stderr_file,
+                stdout=stdout_target,
+                stderr=stderr_target,
                 text=True
             )
             process.communicate()
+
             if process.returncode != 0:
                 raise Exception(f'Build failed for {model.config.get_project_name()}. See logs for details.')
+        finally:
+            if not log_to_stdout:
+                stdout_target.close()
+                stderr_target.close()

         return parse_vivado_report(output_dir)

diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py
index 4d067fb8d8..30e65bba5e 100644
--- a/hls4ml/model/graph.py
+++ b/hls4ml/model/graph.py
@@ -1096,41 +1096,42 @@ def build(self, stitch_design=False, sim_stitched_design=False, export_stitched_
         if (sim_stitched_design or export_stitched_design) and not stitch_design:
             raise ValueError("You can't simulate or export a stitched design without enabling stitch_design.")
         build_results = {}
-        total_builds = len(self.graphs)
         status = {}
         status_lock = threading.Lock()

-        for g in self.graphs:
-            project_name = g.config.get_project_name()
-            status[project_name] = 'Pending'
+        for idx, g in enumerate(self.graphs, start=1):
+            status[f'graph{idx}'] = 'Pending'

-        def build_wrapper(g, **kwargs):
-            project_name = g.config.get_project_name()
+        def build_wrapper(idx, g, **kwargs):
+            graph_name = f'graph{idx}'
             with status_lock:
-                status[project_name] = 'Running'
+                status[graph_name] = 'Running'
                 self._print_status(status)
             try:
-                result = g.build(**kwargs)
+                result = g.build(log_to_stdout = False, **kwargs)
                 with status_lock:
-                    status[project_name] = 'Completed'
+                    status[graph_name] = 'Completed'
                     self._print_status(status)
                 return result
             except Exception as exc:
                 with status_lock:
-                    status[project_name] = 'Failed'
+                    status[graph_name] = 'Failed'
                     self._print_status(status)
                 raise

         with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-            future_to_g = {executor.submit(build_wrapper, g, **kwargs): g for g in self.graphs}
-            for future in concurrent.futures.as_completed(future_to_g):
-                g = future_to_g[future]
-                project_name = g.config.get_project_name()
+            future_to_idx = {
+                executor.submit(build_wrapper, idx, g, **kwargs): idx
+                for idx, g in enumerate(self.graphs, start=1)
+            }
+            for 
future in concurrent.futures.as_completed(future_to_idx): + idx = future_to_idx[future] + graph_name = f'graph{idx}' try: result = future.result() - build_results[project_name] = result + build_results[graph_name] = result except Exception as exc: - build_results[project_name] = None + build_results[graph_name] = None self.graph_reports=build_results self._replace_logos() @@ -1186,7 +1187,7 @@ def trace(self, x): return output_data, trace_output def write_build_script(self): - # NOTE if we move this function to Vivado writer we need to pass graph objects + # NOTE we need to move this function to Vivado writer with each graph object spec = importlib.util.find_spec('hls4ml') hls4ml_path = os.path.dirname(spec.origin) build_lib_src = os.path.join(hls4ml_path, 'templates/vivado/build_lib_multigraph.sh') @@ -1203,7 +1204,7 @@ def write_build_script(self): os.chmod(build_lib_dst, os.stat(build_lib_dst).st_mode | stat.S_IEXEC) def write_bridge(self): - # NOTE if we move this function to Vivado writer we need to pass graph objects + # NOTE we need to move this function to Vivado writer with each graph object """Write the Python-C++ bridge (myproject_bridge.cpp) Args: model (ModelGraph): the hls4ml model. From 4a7e6c374313edf65995d30d3f338336cd889d99 Mon Sep 17 00:00:00 2001 From: dimdano Date: Fri, 24 Jan 2025 15:45:09 +0100 Subject: [PATCH 36/50] small change --- hls4ml/backends/vivado/passes/transform_types.py | 1 + hls4ml/model/graph.py | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/hls4ml/backends/vivado/passes/transform_types.py b/hls4ml/backends/vivado/passes/transform_types.py index 7bff3b8efc..c2ab6f73c0 100644 --- a/hls4ml/backends/vivado/passes/transform_types.py +++ b/hls4ml/backends/vivado/passes/transform_types.py @@ -31,6 +31,7 @@ def transform(self, model, node): new_var = self.array_var_converter.convert(var, pragma='stream') elif io_type == 'io_parallel': if out_name in node.model.inputs: + # NOTE this needs to be changed to partition new_var = self.array_var_converter.convert(var, pragma='reshape') elif isinstance(var, InplaceTensorVariable): new_var = self.inplace_array_var_converter.convert(var, pragma='') diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 30e65bba5e..563b25d5bf 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -1310,7 +1310,7 @@ def _get_pragma_details(self, pragma): if isinstance(pragma, str): pragma_str = pragma # 'reshape' or 'partition' pragma fifo_depth = 1 - elif isinstance(pragma, (list, tuple)) and len(pragma) == 2: + elif isinstance(pragma, (list, tuple)): pragma_str = pragma[0] # 'stream' pragma fifo_depth = pragma[1] else: @@ -1341,8 +1341,11 @@ def _assert_consistent_pragmas(self): ) if len(ref_pragmas) != 1: - raise ValueError(f"Multiple pragmas detected in 1st graph: {ref_pragmas} ") - + raise ValueError( + f"Multiple pragmas detected in 1st graph: {ref_pragmas}. " + "Ensure all graphs have the same interface (stream or partition)." 
+ ) + for idx, g in enumerate(self.graphs[1:], start=1): current_pragmas = set( self._get_pragma_details(g.output_vars[layer].pragma)[0] From d6c19d517f535890b78d5c41ee4c39cd99e9364d Mon Sep 17 00:00:00 2001 From: dimdano Date: Fri, 24 Jan 2025 16:39:05 +0100 Subject: [PATCH 37/50] remove bridged multigraph compilation for now --- hls4ml/model/graph.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 563b25d5bf..ac72cd9444 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -1152,9 +1152,10 @@ def build_wrapper(idx, g, **kwargs): def compile(self): for g in self.graphs: g.compile() - self.write_build_script() - self.write_bridge() - self._compile() + # TODO + #self.write_build_script() + #self.write_bridge() + #self._compile() def predict(self, x, sim = 'csim'): if sim == 'csim': From 022584569459ad5e24359337c7f48f10d45d5cf2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Jan 2025 16:42:14 +0000 Subject: [PATCH 38/50] [pre-commit.ci] auto fixes from pre-commit hooks --- docs/ir/multimodelgraph.rst | 16 +- hls4ml/backends/vitis/vitis_backend.py | 63 +++--- hls4ml/converters/keras_to_hls.py | 8 +- hls4ml/model/graph.py | 179 +++++++++--------- hls4ml/report/vivado_report.py | 9 +- .../templates/vivado/build_lib_multigraph.sh | 2 +- hls4ml/templates/vivado/ip_stitcher.tcl | 22 +-- hls4ml/utils/simulation_utils.py | 96 +++++----- 8 files changed, 195 insertions(+), 200 deletions(-) diff --git a/docs/ir/multimodelgraph.rst b/docs/ir/multimodelgraph.rst index d14cc4ec2c..4cb0bfc726 100644 --- a/docs/ir/multimodelgraph.rst +++ b/docs/ir/multimodelgraph.rst @@ -2,8 +2,8 @@ MultiModelGraph Class ======================= -This page documents the ``MultiModelGraph`` class, which enables handling multiple subgraphs (each represented as a ``ModelGraph``) derived from a single original model. -The central concept here is the division of a larger model into multiple smaller subgraphs at given layers which can be useful for: +This page documents the ``MultiModelGraph`` class, which enables handling multiple subgraphs (each represented as a ``ModelGraph``) derived from a single original model. 
+The central concept here is the division of a larger model into multiple smaller subgraphs at given layers which can be useful for: * Very large models * Step-wise optimization @@ -26,8 +26,8 @@ For example, when converting a Keras model, you can specify the layers at which config = hls4ml.utils.config_from_keras_model(model, granularity='model') hls_model = hls4ml.converters.convert_from_keras_model( - model, - hls_config=config, + model, + hls_config=config, backend='vitis', split_layer_names = ['layer3', 'layer7'] ) @@ -39,10 +39,10 @@ Here, the ``hls_model`` is actually a ``MultiModelGraph`` containing three subgr Key Methods for MultiModelGraph ---------------------------------- -* :ref:`compile ` -* :ref:`predict ` -* :ref:`build ` -* :ref:`trace ` +* :ref:`compile ` +* :ref:`predict ` +* :ref:`build ` +* :ref:`trace ` * :ref:`make_multi_graph ` ---- diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index 6ff8001e7e..d0913e78c1 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -1,14 +1,20 @@ -import os -import sys -import subprocess import importlib.util import json +import os import shutil +import subprocess +import sys from hls4ml.backends import VivadoBackend from hls4ml.model.flow import get_flow, register_flow -from hls4ml.report import parse_vivado_report, aggregate_graph_reports -from hls4ml.utils.simulation_utils import write_verilog_testbench, read_testbench_log, write_testbench_input, prepare_testbench_input, prepare_zero_input +from hls4ml.report import aggregate_graph_reports, parse_vivado_report +from hls4ml.utils.simulation_utils import ( + prepare_testbench_input, + prepare_zero_input, + read_testbench_log, + write_testbench_input, + write_verilog_testbench, +) class VitisBackend(VivadoBackend): @@ -114,18 +120,13 @@ def build( output_dir = model.config.get_output_dir() stdout_log = os.path.join(output_dir, 'build_stdout.log') stderr_log = os.path.join(output_dir, 'build_stderr.log') - + stdout_target = None if log_to_stdout else open(stdout_log, 'w') stderr_target = None if log_to_stdout else open(stderr_log, 'w') try: process = subprocess.Popen( - build_command, - shell=True, - cwd=output_dir, - stdout=stdout_target, - stderr=stderr_target, - text=True + build_command, shell=True, cwd=output_dir, stdout=stdout_target, stderr=stderr_target, text=True ) process.communicate() @@ -137,7 +138,7 @@ def build( stderr_target.close() return parse_vivado_report(output_dir) - + def build_stitched_design( self, stitch_design=True, @@ -145,7 +146,8 @@ def build_stitched_design( export_stitched_design=False, nn_config=None, graph_reports=None, - simulation_input_data=None): + simulation_input_data=None, + ): os.makedirs(nn_config['OutputDir'], exist_ok=True) stitched_design_dir = os.path.join(nn_config['OutputDir'], nn_config['StitchedProjectName']) @@ -155,11 +157,11 @@ def build_stitched_design( spec = importlib.util.find_spec('hls4ml') hls4ml_path = os.path.dirname(spec.origin) - ip_stitcher_path = os.path.join(hls4ml_path, 'templates/vivado/ip_stitcher.tcl') + ip_stitcher_path = os.path.join(hls4ml_path, 'templates/vivado/ip_stitcher.tcl') stdout_log = os.path.join(stitched_design_dir, 'stitcher_stdout.log') stderr_log = os.path.join(stitched_design_dir, 'stitcher_stderr.log') nn_config_path = os.path.join(stitched_design_dir, 'nn_config.json') - testbench_path = os.path.join(stitched_design_dir, 'testbench.v') + testbench_path = os.path.join(stitched_design_dir, 'testbench.v') 
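For context on the data that feeds these testbench files, a small sketch of the shape contract enforced by prepare_testbench_input (the fifo_depth and batch_size values below are invented for illustration):

import numpy as np

# Each input layer must supply exactly fifo_depth * batch_size values;
# they are reshaped to (fifo_depth, batch_size) words before
# write_testbench_input converts them to fixed-point text.
flat = np.arange(8, dtype=float)  # e.g. fifo_depth=4, batch_size=2
words = flat.reshape((4, 2))
assert words.shape == (4, 2)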
testbench_log_path = os.path.join(stitched_design_dir, 'testbench_log.csv') try: @@ -170,8 +172,8 @@ def build_stitched_design( if nn_config: with open(nn_config_path, "w") as file: json.dump(nn_config, file, indent=4) - - if(sim_stitched_design): + + if sim_stitched_design: write_verilog_testbench(nn_config, testbench_path) # Produce a testbench input file for every input layer for i, layer in enumerate(nn_config['inputs']): @@ -184,30 +186,33 @@ def build_stitched_design( # Handles both single and multi-layer cases. First dim should always be batch size data = simulation_input_data[i] input_data_reshaped = prepare_testbench_input(data, layer['fifo_depth'], layer['batch_size']) - write_testbench_input(input_data_reshaped, testbench_input_path, layer['integer_bits'], layer['fractional_bits']) + write_testbench_input( + input_data_reshaped, testbench_input_path, layer['integer_bits'], layer['fractional_bits'] + ) print('Verilog testbench and its input data were generated.') print('Running build process of stitched IP...\n') stitch_command = [ - 'vivado', '-mode', 'batch', '-nojournal', '-nolog', '-notrace', - '-source', ip_stitcher_path, + 'vivado', + '-mode', + 'batch', + '-nojournal', + '-nolog', + '-notrace', + '-source', + ip_stitcher_path, '-tclargs', f'stitch_design={int(stitch_design)}', f'sim_design={int(sim_stitched_design)}', f'export_design={int(export_stitched_design)}', f"stitch_project_name={nn_config['StitchedProjectName']}", f"original_project_name={nn_config['OriginalProjectName']}", - f'sim_verilog_file=testbench.v' + f'sim_verilog_file=testbench.v', ] - + with open(stdout_log, 'w') as stdout_file, open(stderr_log, 'w') as stderr_file: process = subprocess.Popen( - stitch_command, - cwd=stitched_design_dir, - stdout=stdout_file, - stderr=stderr_file, - text=True, - shell=False + stitch_command, cwd=stitched_design_dir, stdout=stdout_file, stderr=stderr_file, text=True, shell=False ) process.communicate() if process.returncode != 0: diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index bdc68da563..7a6bd9de28 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ -322,16 +322,18 @@ def parse_keras_model(model_arch, reader): return layer_list, input_layers, output_layers, output_shapes -def keras_to_hls(config, split_layer_names = None): +def keras_to_hls(config, split_layer_names=None): model_arch, reader = get_model_arch(config) layer_list, input_layers, output_layers, output_shapes = parse_keras_model(model_arch, reader) - + print('Creating HLS model...') merge_layers = ['add', 'subtract', 'multiply', 'average', 'maximum', 'minimum', 'concatenate', 'dot'] if split_layer_names: if any(any(layer in name for layer in merge_layers) for name in split_layer_names): raise ValueError(f'Split layer must not be a merge layer') - hls_model = ModelGraph.make_multi_graph(config, layer_list, input_layers, output_layers, output_shapes, split_layer_names) + hls_model = ModelGraph.make_multi_graph( + config, layer_list, input_layers, output_layers, output_shapes, split_layer_names + ) print('Multi-graph HLS model created.') else: hls_model = ModelGraph(config, layer_list, input_layers, output_layers) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index ac72cd9444..aa2bf815a2 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -1,18 +1,18 @@ +import concurrent.futures +import copy import ctypes +import importlib.util import os import platform +import re +import shutil +import stat +import 
threading +import warnings from collections import OrderedDict import numpy as np import numpy.ctypeslib as npc -import copy -import stat -import importlib.util -import shutil -import re -import warnings -import concurrent.futures -import threading from hls4ml.backends import get_backend from hls4ml.model.flow import get_flow @@ -20,6 +20,7 @@ from hls4ml.model.optimizer import get_available_passes, optimize_model from hls4ml.utils.string_utils import convert_to_snake_case + class HLSConfig: """The configuration class as stored in the ModelGraph. @@ -944,7 +945,7 @@ def make_multi_graph(cls, config, layer_list, input_layers, output_layers, outpu current_index = 0 last_output_precision = None for idx, sub_layer_list in enumerate(subgraph_layer_lists): - + # Create a shallow copy of the config for each subgraph sub_config = copy.copy(config) sub_config['OutputDir'] = os.path.join(original_OutputDir, f'graph{idx + 1}') @@ -957,10 +958,10 @@ def make_multi_graph(cls, config, layer_list, input_layers, output_layers, outpu previous_layer = layer_list[previous_layer_index] previous_layer_name = previous_layer['name'] input_shape = output_shapes.get(previous_layer_name, None) - #NOTE - Verify that the input shape is correctly identified + # NOTE - Verify that the input shape is correctly identified if input_shape is None: raise ValueError(f"Could not find input_shape of '{split_layer_names[idx - 1]}'.") - + current_split_layer = sub_layer_list[0] input_layer_name = current_split_layer['name'] + '_input' input_layer_dict = { @@ -970,51 +971,49 @@ def make_multi_graph(cls, config, layer_list, input_layers, output_layers, outpu 'input_shape': input_shape[1:], } # Reset the inputs of the split layer in the current graph - #NOTE - Better allow it to automatically determine its inputs + # NOTE - Better allow it to automatically determine its inputs sub_layer_list[0]['inputs'] = [] # Then insert the new input layer at the beginning sub_layer_list.insert(0, input_layer_dict) # Copy 'Precision' and 'Trace' from the previous layer's config to the new input layer's config - if 'LayerName' in sub_config['HLSConfig']: + if 'LayerName' in sub_config['HLSConfig']: if previous_layer_name in sub_config['HLSConfig']['LayerName']: prev_layer_config = sub_config['HLSConfig']['LayerName'][previous_layer_name] new_layer_config = {} new_layer_config['Precision'] = prev_layer_config['Precision'] - #NOTE - We copy Trace as well but it might be better to reset it - new_layer_config['Trace'] = prev_layer_config['Trace'] - # copy last layer config from previous graph to the new input layer config of current graph + # NOTE - We copy Trace as well but it might be better to reset it + new_layer_config['Trace'] = prev_layer_config['Trace'] + # copy last layer config from previous graph to the new input layer config of current graph sub_config['HLSConfig']['LayerName'][input_layer_name] = new_layer_config else: raise KeyError(f"Layer '{previous_layer_name}' not found in subconfig.") else: - pass # case of granularity='Model' - + pass # case of granularity='Model' + graph_output_layers = output_layers if idx == len(subgraph_layer_lists) - 1 else None graph_input_layers = input_layers if idx == 0 else None - hls_model = ModelGraph(sub_config, sub_layer_list, - graph_input_layers, - graph_output_layers, - initial_index=current_index) + hls_model = ModelGraph( + sub_config, sub_layer_list, graph_input_layers, graph_output_layers, initial_index=current_index + ) - # After creating subgraph, get the precision from the last layer's 
output. + # After creating subgraph, get the precision from the last layer's output. if hls_model.graph: try: last_layer = next(reversed(hls_model.graph.values())) last_output_precision = last_layer.attributes['precision']['result'] except (KeyError, AttributeError): warnings.warn( - "Could not find precision in the last layer. " - "Setting 'last_output_precision' to 'auto'." + "Could not find precision in the last layer. " "Setting 'last_output_precision' to 'auto'." ) - last_output_precision = 'auto' + last_output_precision = 'auto' # Update the current index for the next graph # Get the index of the last element in the graph layer_indices = [layer.index for layer in hls_model.graph.values()] if layer_indices: max_index = max(layer_indices) - current_index = max_index - 1 # we have the new input layer as well + current_index = max_index - 1 # we have the new input layer as well model_graphs.append(hls_model) return MultiModelGraph(model_graphs) @@ -1042,19 +1041,17 @@ def _initialize_config(self, first_graph): original_project_name = first_graph.config.get_project_name().partition('_graph')[0] self.config.config['ProjectName'] = f"{original_project_name}_stitched" self.config.config['OriginalProjectName'] = original_project_name - original_output_dir = first_graph.config.get_output_dir().partition('/graph')[0] + original_output_dir = first_graph.config.get_output_dir().partition('/graph')[0] self.config.config['OutputDir'] = os.path.join(original_output_dir, 'stitched') def _deepcopy_config_names(self, config): # Deep copy only 'ProjectName' and 'OutputDir', shallow copy others keys_to_deepcopy = ['ProjectName', 'OutputDir'] - self.config.config = {k: copy.deepcopy(config[k]) - if k in keys_to_deepcopy - else config[k] for k in config} + self.config.config = {k: copy.deepcopy(config[k]) if k in keys_to_deepcopy else config[k] for k in config} def __getitem__(self, index): return self.graphs[index] - + def parse_nn_config(self): nn_config = {"inputs": [], "outputs": []} nn_config['OutputDir'] = self.config.config['OutputDir'] @@ -1073,26 +1070,32 @@ def parse_nn_config(self): raise ValueError(f"Division of total_bits by fifo_depth does not result in a remainder of zero.") batch_size = total_bits // fifo_depth precision = graph.output_vars[layer].type.precision - nn_config[io_type].append({ - "name": graph.output_vars[layer].name, - "pragma": layer_pragma, - "integer_bits": int(precision.integer), - "fractional_bits": int(precision.fractional), - "signed": int(precision.signed), - "fifo_depth": int(fifo_depth), - "batch_size": int(batch_size) - }) - - return nn_config - - def build(self, stitch_design=False, sim_stitched_design=False, export_stitched_design=False, max_workers=None, **kwargs): + nn_config[io_type].append( + { + "name": graph.output_vars[layer].name, + "pragma": layer_pragma, + "integer_bits": int(precision.integer), + "fractional_bits": int(precision.fractional), + "signed": int(precision.signed), + "fifo_depth": int(fifo_depth), + "batch_size": int(batch_size), + } + ) + + return nn_config + + def build( + self, stitch_design=False, sim_stitched_design=False, export_stitched_design=False, max_workers=None, **kwargs + ): """ Builds all ModelGraph instances in parallel, with optional stitching and export. 
""" export = kwargs.get('export', False) if (stitch_design or sim_stitched_design or export_stitched_design) and not export: - raise ValueError("You can't enable stitch_design, sim_stitched_design, or export_stitched_design without having export=True.") + raise ValueError( + "You can't enable stitch_design, sim_stitched_design, or export_stitched_design without having export=True." + ) if (sim_stitched_design or export_stitched_design) and not stitch_design: raise ValueError("You can't simulate or export a stitched design without enabling stitch_design.") build_results = {} @@ -1108,7 +1111,7 @@ def build_wrapper(idx, g, **kwargs): status[graph_name] = 'Running' self._print_status(status) try: - result = g.build(log_to_stdout = False, **kwargs) + result = g.build(log_to_stdout=False, **kwargs) with status_lock: status[graph_name] = 'Completed' self._print_status(status) @@ -1121,8 +1124,7 @@ def build_wrapper(idx, g, **kwargs): with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: future_to_idx = { - executor.submit(build_wrapper, idx, g, **kwargs): idx - for idx, g in enumerate(self.graphs, start=1) + executor.submit(build_wrapper, idx, g, **kwargs): idx for idx, g in enumerate(self.graphs, start=1) } for future in concurrent.futures.as_completed(future_to_idx): idx = future_to_idx[future] @@ -1133,7 +1135,7 @@ def build_wrapper(idx, g, **kwargs): except Exception as exc: build_results[graph_name] = None - self.graph_reports=build_results + self.graph_reports = build_results self._replace_logos() if stitch_design or sim_stitched_design or export_stitched_design: @@ -1144,7 +1146,8 @@ def build_wrapper(idx, g, **kwargs): sim_stitched_design=sim_stitched_design, export_stitched_design=export_stitched_design, nn_config=nn_config, - graph_reports=self.graph_reports) + graph_reports=self.graph_reports, + ) return stitched_report return self.graph_reports @@ -1152,12 +1155,12 @@ def build_wrapper(idx, g, **kwargs): def compile(self): for g in self.graphs: g.compile() - # TODO - #self.write_build_script() - #self.write_bridge() - #self._compile() + # TODO + # self.write_build_script() + # self.write_bridge() + # self._compile() - def predict(self, x, sim = 'csim'): + def predict(self, x, sim='csim'): if sim == 'csim': input_data = x for g in self.graphs: @@ -1172,11 +1175,12 @@ def predict(self, x, sim = 'csim'): export_stitched_design=False, nn_config=nn_config, graph_reports=self.graph_reports, - simulation_input_data=x) + simulation_input_data=x, + ) return stitched_report['BehavSimResults'] - else: + else: print('Unknown simulation option given.') - + def trace(self, x): # TODO: finish trace function input_data = x @@ -1186,14 +1190,14 @@ def trace(self, x): input_data = output_data trace_output.append(curr_trace_output) return output_data, trace_output - + def write_build_script(self): - # NOTE we need to move this function to Vivado writer with each graph object + # NOTE we need to move this function to Vivado writer with each graph object spec = importlib.util.find_spec('hls4ml') hls4ml_path = os.path.dirname(spec.origin) - build_lib_src = os.path.join(hls4ml_path, 'templates/vivado/build_lib_multigraph.sh') + build_lib_src = os.path.join(hls4ml_path, 'templates/vivado/build_lib_multigraph.sh') os.makedirs(self.config.config['OutputDir'], exist_ok=True) - build_lib_dst = os.path.join(self.config.config['OutputDir'], 'build_lib.sh') + build_lib_dst = os.path.join(self.config.config['OutputDir'], 'build_lib.sh') graph_project_names = ' 
'.join(f"\"{g.config.get_output_dir().split('/')[-1]}\"" for g in self.graphs) with open(build_lib_src) as src, open(build_lib_dst, 'w') as dst: for line in src.readlines(): @@ -1203,9 +1207,9 @@ def write_build_script(self): line = line.replace('mygraph_name_list', graph_project_names) dst.write(line) os.chmod(build_lib_dst, os.stat(build_lib_dst).st_mode | stat.S_IEXEC) - + def write_bridge(self): - # NOTE we need to move this function to Vivado writer with each graph object + # NOTE we need to move this function to Vivado writer with each graph object """Write the Python-C++ bridge (myproject_bridge.cpp) Args: model (ModelGraph): the hls4ml model. @@ -1213,7 +1217,7 @@ def write_bridge(self): filedir = os.path.dirname(os.path.abspath(__file__)) f = open(os.path.join(filedir, '../templates/vivado/myproject_bridge_multigraph.cpp')) - fout = open(f"{self.config.get_output_dir()}/{self.config.config['ProjectName']}_bridge.cpp", 'w') + fout = open(f"{self.config.get_output_dir()}/{self.config.config['ProjectName']}_bridge.cpp", 'w') model_inputs = self.graphs[0].get_input_variables() model_outputs = self.graphs[-1].get_output_variables() model_brams = [var for var in self.graphs[0].get_weight_variables() if var.storage.lower() == 'bram'] @@ -1227,7 +1231,7 @@ def write_bridge(self): elif 'firmware/myproject' in line: for graph_idx in range(len(self.graphs)): newline += line.replace('myproject', format(self.graphs[graph_idx].config.config['ProjectName'])) - newline += '\n#undef DEFINES_H_\n' if graph_idx < len(self.graphs)-1 else '' + newline += '\n#undef DEFINES_H_\n' if graph_idx < len(self.graphs) - 1 else '' elif 'myproject' in line: newline = line.replace('myproject', format(self.graphs[0].config.config['ProjectName'])) @@ -1303,7 +1307,7 @@ def write_bridge(self): f.close() fout.close() - + def _get_pragma_details(self, pragma): """ Extracts the pragma type and FIFO depth from the given pragma. @@ -1316,17 +1320,12 @@ def _get_pragma_details(self, pragma): fifo_depth = pragma[1] else: raise ValueError(f"Unexpected format for pragma: {pragma}") - + return pragma_str, fifo_depth - + def _print_status(self, status): print('\r', end='') - status_icons = { - 'Pending': '○', - 'Running': '⌛', - 'Completed': '✅', - 'Failed': '❌' - } + status_icons = {'Pending': '○', 'Running': '⌛', 'Completed': '✅', 'Failed': '❌'} status_str = ' | '.join(f'{proj}: {status_icons.get(stat, "?")}' for proj, stat in status.items()) print(status_str, flush=True) @@ -1335,30 +1334,28 @@ def _assert_consistent_pragmas(self): Ensure all graphs have the same pragma in their input and output layers. Stitching and simulating mixed pragmas is not supported at the moment. """ - ref_pragmas = set( - self._get_pragma_details(self.graphs[0].output_vars[layer].pragma)[0] + ref_pragmas = { + self._get_pragma_details(self.graphs[0].output_vars[layer].pragma)[0] for layer in self.graphs[0].inputs + self.graphs[0].outputs if layer in self.graphs[0].output_vars - ) + } if len(ref_pragmas) != 1: raise ValueError( f"Multiple pragmas detected in 1st graph: {ref_pragmas}. " "Ensure all graphs have the same interface (stream or partition)." 
) - + for idx, g in enumerate(self.graphs[1:], start=1): - current_pragmas = set( - self._get_pragma_details(g.output_vars[layer].pragma)[0] + current_pragmas = { + self._get_pragma_details(g.output_vars[layer].pragma)[0] for layer in g.inputs + g.outputs if layer in g.output_vars - ) + } if ref_pragmas != current_pragmas: raise ValueError( - f"Pragma mismatch in graph {idx}:\n" - f"Expected: {ref_pragmas}\n" - f"Found: {current_pragmas}" + f"Pragma mismatch in graph {idx}:\n" f"Expected: {ref_pragmas}\n" f"Found: {current_pragmas}" ) def _replace_logos(self): @@ -1372,15 +1369,11 @@ def _replace_logos(self): for g in self.graphs: graph_logo_paths = [ os.path.join( - g.config.get_output_dir(), - g.config.get_project_name() + '_prj', - 'solution1/impl/misc/logo.png' + g.config.get_output_dir(), g.config.get_project_name() + '_prj', 'solution1/impl/misc/logo.png' ), os.path.join( - g.config.get_output_dir(), - g.config.get_project_name() + '_prj', - 'solution1/impl/ip/misc/logo.png' - ) + g.config.get_output_dir(), g.config.get_project_name() + '_prj', 'solution1/impl/ip/misc/logo.png' + ), ] try: for logo in graph_logo_paths: diff --git a/hls4ml/report/vivado_report.py b/hls4ml/report/vivado_report.py index ec4182dcb4..ab5a24f147 100644 --- a/hls4ml/report/vivado_report.py +++ b/hls4ml/report/vivado_report.py @@ -684,10 +684,7 @@ def aggregate_graph_reports(graph_reports): keys_to_sum = ['BRAM_18K', 'DSP', 'FF', 'LUT', 'URAM'] first_subgraph = next(iter(graph_reports)) - reportChoice = ( - 'CSynthesisReport' if 'VivadoSynthReport' not in graph_reports[first_subgraph] - else 'VivadoSynthReport' - ) + reportChoice = 'CSynthesisReport' if 'VivadoSynthReport' not in graph_reports[first_subgraph] else 'VivadoSynthReport' base_report = graph_reports[first_subgraph][reportChoice] csynth_report = graph_reports[first_subgraph].get('CSynthesisReport', base_report) @@ -695,7 +692,7 @@ def aggregate_graph_reports(graph_reports): 'TargetClockPeriod': csynth_report.get('TargetClockPeriod', 'N/A'), 'EstimatedClockPeriod': float(csynth_report.get('EstimatedClockPeriod', float('inf'))), 'BestLatency': 'N/A', - 'WorstLatency': 'N/A' + 'WorstLatency': 'N/A', } final_report['AvailableBRAM_18K'] = csynth_report.get('AvailableBRAM_18K', 'N/A') @@ -724,4 +721,4 @@ def aggregate_graph_reports(graph_reports): for k in keys_to_sum: final_report[k] = str(final_report[k]) - return {'StitchedDesignReport': final_report} + return {'StitchedDesignReport': final_report} diff --git a/hls4ml/templates/vivado/build_lib_multigraph.sh b/hls4ml/templates/vivado/build_lib_multigraph.sh index 4621a2d2f7..0bb2c76c29 100644 --- a/hls4ml/templates/vivado/build_lib_multigraph.sh +++ b/hls4ml/templates/vivado/build_lib_multigraph.sh @@ -27,7 +27,7 @@ for g in "${graph_project_names[@]}"; do WEIGHTS_DIR="\"${BASEDIR}/${g}/firmware/weights\"" SRC_FILE="${g}/firmware/${ORIGINAL_PROJECT}_${g}.cpp" OBJ_FILE="${ORIGINAL_PROJECT}_${g}.o" - + ${CC} ${CFLAGS} ${AP_TYPES_PATH} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c "${BASEDIR}/${SRC_FILE}" -o "${OBJ_FILE}" OBJECT_FILES+=("${OBJ_FILE}") INCFLAGS+="-I${BASEDIR}/${g}/ " diff --git a/hls4ml/templates/vivado/ip_stitcher.tcl b/hls4ml/templates/vivado/ip_stitcher.tcl index 9b6efe1b2c..4228c00174 100644 --- a/hls4ml/templates/vivado/ip_stitcher.tcl +++ b/hls4ml/templates/vivado/ip_stitcher.tcl @@ -41,7 +41,7 @@ set original_project_name $opt(original_project_name) set base_dir [pwd] set original_project_path "$base_dir/../../" puts $base_dir -# Name of the block design +# Name of the block design set 
bd_name "stitched_design" # Find a directory that ends with "graph1", "graph2", etc. in the parent project folder @@ -86,7 +86,7 @@ proc stitch_procedure {base_dir stitch_project_name original_project_name bd_nam if {[file isdirectory $repo_path]} { # Add repository path to current project's IP repository paths set_property ip_repo_paths [concat [get_property ip_repo_paths [current_project]] $repo_path] [current_project] - + # Increment the repo count incr repo_count @@ -156,7 +156,7 @@ proc stitch_procedure {base_dir stitch_project_name original_project_name bd_nam set rst_port_name "ap_rst" create_bd_port -dir I -type rst $rst_port_name set ap_rst_port [get_bd_ports ap_rst] - + # Set the CONFIG.POLARITY property of the 'ap_rst' port based on the retrieved polarity if {$rst_polarity ne ""} { set_property CONFIG.POLARITY $rst_polarity $ap_rst_port @@ -415,7 +415,7 @@ proc stitch_procedure {base_dir stitch_project_name original_project_name bd_nam set associated_busif [join [concat $input_pin_names $output_pin_names] ":"] set_property CONFIG.ASSOCIATED_BUSIF {$associated_busif} [get_bd_ports /ap_clk] set_property CONFIG.ASSOCIATED_RESET $rst_port_name [get_bd_ports /ap_clk] - + # Make external the 'ap_done' signal of the last IP set last_ip_pins [get_bd_pins -of $last_ip_cell] set last_ap_done_pin "" @@ -433,7 +433,7 @@ proc stitch_procedure {base_dir stitch_project_name original_project_name bd_nam } else { puts "Warning: Could not find 'ap_done' pin for last IP" } - + } elseif {$interface_type == "unpacked"} { # Make 'ap_start' of the first IP external set first_ip_cell [get_bd_cells [lindex $ip_instances 0]] @@ -526,7 +526,7 @@ proc stitch_procedure {base_dir stitch_project_name original_project_name bd_nam save_bd_design - puts "###########################################################" + puts "###########################################################" puts "# Successfully connected the ports of each IP instance " puts "# A total of $repo_count IPs were connected. " puts "###########################################################" @@ -553,13 +553,13 @@ if {$stitch_design} { } if {$export_design} { - set start_time [clock seconds] + set start_time [clock seconds] puts "Exporting stitched IP..." set stitched_ip_dir "ip_repo" ipx::package_project -root_dir $stitched_ip_dir \ -vendor user.org -library user -taxonomy /UserIP -module $bd_name \ -import_files - set_property description "This IP core integrates all NN subgraph IPs into one." [ipx::find_open_core user.org:user:stitched_design:1.0] + set_property description "This IP core integrates all NN subgraph IPs into one." [ipx::find_open_core user.org:user:stitched_design:1.0] set_property core_revision 2 [ipx::find_open_core user.org:user:stitched_design:1.0] ipx::create_xgui_files [ipx::find_open_core user.org:user:stitched_design:1.0] ipx::update_checksums [ipx::find_open_core user.org:user:stitched_design:1.0] @@ -572,7 +572,7 @@ if {$export_design} { } if {$sim_design} { - set start_time [clock seconds] + set start_time [clock seconds] if {$sim_verilog_file == ""} { puts "Error: sim_verilog_file not provided." 
exit 1 @@ -613,7 +613,3 @@ if {$sim_design} { close_project - - - - diff --git a/hls4ml/utils/simulation_utils.py b/hls4ml/utils/simulation_utils.py index 162c06f9e8..6ed266afb4 100644 --- a/hls4ml/utils/simulation_utils.py +++ b/hls4ml/utils/simulation_utils.py @@ -1,8 +1,10 @@ -import os -from lxml import etree import json +import os + import numpy as np -import pandas as pd +import pandas as pd +from lxml import etree + def parse_component_xml(component_xml_path): """ @@ -24,7 +26,7 @@ def parse_component_xml(component_xml_path): ns = { 'spirit': 'http://www.spiritconsortium.org/XMLSchema/SPIRIT/1685-2009', 'xilinx': 'http://www.xilinx.com', - 'xsi': 'http://www.w3.org/2001/XMLSchema-instance' + 'xsi': 'http://www.w3.org/2001/XMLSchema-instance', } # Extract ports @@ -81,33 +83,33 @@ def write_verilog_testbench(nn_config, testbench_output_path): output_signals.append((output_item['name'], total_bits)) with open(testbench_output_path, 'w') as f: - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Header and Module Declaration - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write('`timescale 1ns / 1ps\n\n') f.write('module tb_design_1_wrapper;\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Clock and Reset Signals - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Clock and Reset Signals\n') f.write(' //------------------------------------------------------------------------\n') f.write(' reg ap_clk;\n') f.write(' reg ap_rst_n;\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Control and Handshaking Signals - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Control and Handshaking Signals\n') f.write(' //------------------------------------------------------------------------\n') f.write(' reg ap_start;\n') f.write(' wire ap_done;\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # AXI4-Stream Input Interfaces - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // AXI4-Stream Input Interfaces\n') f.write(' //------------------------------------------------------------------------\n') @@ -119,9 +121,9 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(f' reg {layer["name"]}_tvalid;\n') f.write(f' wire {layer["name"]}_tready;\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # AXI4-Stream Output Interfaces - 
#---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // AXI4-Stream Output Interfaces\n') f.write(' //------------------------------------------------------------------------\n') @@ -133,9 +135,9 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(f' wire {layer["name"]}_tvalid;\n') f.write(f' reg {layer["name"]}_tready;\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # DUT Instantiation - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // DUT Instantiation\n') f.write(' //------------------------------------------------------------------------\n') @@ -167,9 +169,9 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(f' .{name}_tvalid({name}_tvalid)\n') f.write(' );\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Clock Generation - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Clock Generation (100 MHz => 10 ns period)\n') f.write(' //------------------------------------------------------------------------\n') @@ -178,9 +180,9 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(' forever #5 ap_clk = ~ap_clk;\n') f.write(' end\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Reset Generation - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Reset Generation\n') f.write(' // Wait for a few cycles and then release reset.\n') @@ -191,9 +193,9 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(' ap_rst_n = 1;\n') f.write(' end\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Signal Initialization - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Signal Initialization\n') f.write(' // Initialize control signals, input valid, and output ready.\n') @@ -206,9 +208,9 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(f' {name}_tready = 1;\n') f.write(' end\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Variables for Logging and Measurement - #---------------------------------------------------------------------- + # 
---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Logging and Measurement Variables\n') f.write(' //------------------------------------------------------------------------\n') @@ -222,9 +224,9 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(' reg [1:0] done_counter = 0;\n') f.write(' reg old_ap_done = 0;\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Cycle Counting - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Cycle Counting\n') f.write(' // Count cycles to measure latency.\n') @@ -236,9 +238,9 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(' cycle_count <= cycle_count + 1;\n') f.write(' end\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Data Transmission (Stimulus Generation) - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Data Transmission (Stimulus)\n') f.write(' // Send input patterns to the DUT.\n') @@ -308,9 +310,9 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(' end\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Output Data Capture and Logging - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Output Data Capture and Logging\n') f.write(' // Capture output for 2nd input (done_counter == 1) and log them to CSV.\n') @@ -331,16 +333,18 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(f' for (idx_{i} = 0; idx_{i} < {layer["batch_size"]}; idx_{i} = idx_{i} + 1) begin\n') f.write(f' fixed_val_{i} = {layer_name}_tdata[(idx_{i}+1)*{total_bits}-1 -: {total_bits}];\n') f.write(f' real_val_{i} = fixed_val_{i} / (1.0 * (1 << {f_bits}));\n') - f.write(f' $display("Output {layer_name}[%0d]: integer_bits=%0d fractional_bits=%0d value=%f", idx_{i}, {i_bits}, {f_bits}, real_val_{i});\n') + f.write( + f' $display("Output {layer_name}[%0d]: integer_bits=%0d fractional_bits=%0d value=%f", idx_{i}, {i_bits}, {f_bits}, real_val_{i});\n' + ) f.write(' // Log result to CSV\n') f.write(f' $fwrite(csv_file, "%s,%0d,%f\\n", "{layer_name}", idx_{i}, real_val_{i});\n') f.write(' end\n') f.write(' end\n') f.write(' end\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Latency Measurement and Test End - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Latency 
Measurement\n') f.write(' // Measures the cycle count between start and subsequent ap_done signals.\n') @@ -370,6 +374,7 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write('endmodule\n') + def float_to_fixed(float_value, integer_bits=6, fractional_bits=10): scaling_factor = 1 << fractional_bits total_bits = integer_bits + fractional_bits @@ -386,6 +391,7 @@ def float_to_fixed(float_value, integer_bits=6, fractional_bits=10): return fixed_value + def write_testbench_input(float_inputs, file_name, integer_bits=6, fractional_bits=10): """ Convert 1D or 2D arrays (or lists of floats) to fixed-point and write to file. @@ -406,25 +412,25 @@ def write_testbench_input(float_inputs, file_name, integer_bits=6, fractional_bi def prepare_zero_input(layer): - batch_size = layer['batch_size'] - fifo_depth = layer['fifo_depth'] - zero_input = np.zeros((fifo_depth, batch_size), dtype=np.int32) - return zero_input + batch_size = layer['batch_size'] + fifo_depth = layer['fifo_depth'] + zero_input = np.zeros((fifo_depth, batch_size), dtype=np.int32) + return zero_input + def prepare_testbench_input(data, fifo_depth, batch_size): data_arr = np.array(data) # Ensure that total elements = fifo_depth * batch_size total_elements = fifo_depth * batch_size if data_arr.size != total_elements: - raise ValueError( - f"Data size {data_arr.size} does not match fifo_depth * batch_size = {total_elements}" - ) + raise ValueError(f"Data size {data_arr.size} does not match fifo_depth * batch_size = {total_elements}") data_reshaped = data_arr.reshape((fifo_depth, batch_size)) return data_reshaped + def read_testbench_log(testbench_log_path): """ - Reads the testbench log file and returns a dictionary + Reads the testbench log file and returns a dictionary """ if not os.path.exists(testbench_log_path): print(f"Error: The file '{testbench_log_path}' does not exist.") @@ -435,12 +441,8 @@ def read_testbench_log(testbench_log_path): BestLatency = df[df['output_name'] == 'BestLatency']['value'].iloc[0] WorstLatency = df[df['output_name'] == 'WorstLatency']['value'].iloc[0] output_df = df[~df['output_name'].isin(['BestLatency', 'WorstLatency'])] - - sim_dict = { - 'BestLatency': int(BestLatency), - 'WorstLatency': int(WorstLatency), - 'BehavSimResults': [] - } + + sim_dict = {'BestLatency': int(BestLatency), 'WorstLatency': int(WorstLatency), 'BehavSimResults': []} grouped = output_df.groupby('output_name') for name, group in grouped: From 89f5eb3ef0e3cbafda5cd0fbdf8416d67e8a76cb Mon Sep 17 00:00:00 2001 From: dimdano Date: Tue, 28 Jan 2025 17:41:46 +0100 Subject: [PATCH 39/50] fix 'ap_rst' port polarity for active high case --- hls4ml/templates/vivado/ip_stitcher.tcl | 46 +++++++++++++++---------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/hls4ml/templates/vivado/ip_stitcher.tcl b/hls4ml/templates/vivado/ip_stitcher.tcl index 4228c00174..db19e27046 100644 --- a/hls4ml/templates/vivado/ip_stitcher.tcl +++ b/hls4ml/templates/vivado/ip_stitcher.tcl @@ -152,25 +152,35 @@ proc stitch_procedure {base_dir stitch_project_name original_project_name bd_nam # Get the CONFIG.POLARITY property from one of the IP's 'ap_rst' pins set sample_rst_pin [lindex $ap_rst_ports 0] set rst_polarity [get_property CONFIG.POLARITY $sample_rst_pin] - # Create the 'ap_rst' port - set rst_port_name "ap_rst" - create_bd_port -dir I -type rst $rst_port_name - set ap_rst_port [get_bd_ports ap_rst] - # Set the CONFIG.POLARITY property of the 'ap_rst' port based on the retrieved polarity + # Only proceed if the polarity is 
defined if {$rst_polarity ne ""} { + # Create the 'ap_rst' port + set rst_port_name "ap_rst" + create_bd_port -dir I -type rst $rst_port_name + set ap_rst_port [get_bd_ports ap_rst] + + # Set the CONFIG.POLARITY property of the 'ap_rst' port based on the retrieved polarity set_property CONFIG.POLARITY $rst_polarity $ap_rst_port - # naming convention for active-low signals - set rst_port_name "ap_rst_n" - set_property NAME $rst_port_name $ap_rst_port + + # Rename the port based on polarity + if {$rst_polarity eq "ACTIVE_LOW"} { + set rst_port_name "ap_rst_n" + set_property NAME $rst_port_name $ap_rst_port + puts "Setting reset port ap_rst_n (ACTIVE_LOW)." + } else { + puts "Setting reset port ap_rst (ACTIVE_HIGH)." + } + # Connect all 'ap_rst' pins to the 'ap_rst' port + foreach rst_pin $ap_rst_ports { + connect_bd_net $ap_rst_port $rst_pin + } } else { - # Fallback to ACTIVE_HIGH if the retrieved polarity is not defined - set_property CONFIG.POLARITY ACTIVE_HIGH $ap_rst_port - } - # Connect all 'ap_rst' pins to the 'ap_rst' port - foreach rst_pin $ap_rst_ports { - connect_bd_net $ap_rst_port $rst_pin + # Fallback: Undefined polarity, no port created + puts "Warning: CONFIG.POLARITY of ap_rst is undefined. No reset port created." } + } else { + puts "Error: No reset ports found." } # Determine interface type @@ -185,7 +195,7 @@ proc stitch_procedure {base_dir stitch_project_name original_project_name bd_nam set interface_type "axi_stream" break } elseif {[regexp {^layer(?:\d+_)?out_(\d+)$} $port_name]} { - set interface_type "unpacked" + set interface_type "partition" break } } @@ -220,8 +230,8 @@ proc stitch_procedure {base_dir stitch_project_name original_project_name bd_nam set ip_i_cell [get_bd_cells $ip_i] set ip_i_plus1_cell [get_bd_cells $ip_i_plus1] - if {$interface_type == "unpacked"} { - # Existing unpacked interface connection logic + if {$interface_type == "partition"} { + # Existing partitioned interface connection logic # Get all output pins from ip_i set output_ports [get_bd_pins -of $ip_i_cell] @@ -434,7 +444,7 @@ proc stitch_procedure {base_dir stitch_project_name original_project_name bd_nam puts "Warning: Could not find 'ap_done' pin for last IP" } - } elseif {$interface_type == "unpacked"} { + } elseif {$interface_type == "partition"} { # Make 'ap_start' of the first IP external set first_ip_cell [get_bd_cells [lindex $ip_instances 0]] if {[string length $first_ip_cell] == 0} { From e21cb53c663aaddba9831b95673778b440f7c6c3 Mon Sep 17 00:00:00 2001 From: dimdano Date: Wed, 29 Jan 2025 12:38:01 +0100 Subject: [PATCH 40/50] support for partition interface in verilog testbench --- hls4ml/utils/simulation_utils.py | 363 +++++++++++++++++++++---------- 1 file changed, 244 insertions(+), 119 deletions(-) diff --git a/hls4ml/utils/simulation_utils.py b/hls4ml/utils/simulation_utils.py index 6ed266afb4..e9a4429d9e 100644 --- a/hls4ml/utils/simulation_utils.py +++ b/hls4ml/utils/simulation_utils.py @@ -61,117 +61,161 @@ def write_verilog_testbench(nn_config, testbench_output_path): Generate a Verilog testbench for a given neural network configuration. 
The testbench includes: - Clock and reset logic - - DUT instantiation and AXI4-Stream interfaces + - DUT instantiation and AXI4-Stream/Partition interfaces - Stimulus generation for inputs - Data capture and logging for outputs - Latency measurement """ inputs = nn_config['inputs'] outputs = nn_config['outputs'] - - input_signals = [] - output_signals = [] - - # Collect input signals (name and total bitwidth) - for input_item in inputs: - total_bits = input_item['integer_bits'] + input_item['fractional_bits'] - input_signals.append((input_item['name'], total_bits)) - - # Collect output signals (name and total bitwidth) - for output_item in outputs: - total_bits = output_item['integer_bits'] + output_item['fractional_bits'] - output_signals.append((output_item['name'], total_bits)) + pragma = nn_config['inputs'][0]['pragma'] + # NOTE we usually have active-low in stream interfaces and active-high in partitioned interfaces. + rst_name = 'ap_rst_n' if pragma == 'stream' else 'ap_rst' with open(testbench_output_path, 'w') as f: - # ---------------------------------------------------------------------- + #---------------------------------------------------------------------- # Header and Module Declaration - # ---------------------------------------------------------------------- + #---------------------------------------------------------------------- f.write('`timescale 1ns / 1ps\n\n') f.write('module tb_design_1_wrapper;\n\n') - # ---------------------------------------------------------------------- + #---------------------------------------------------------------------- # Clock and Reset Signals - # ---------------------------------------------------------------------- + #---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Clock and Reset Signals\n') f.write(' //------------------------------------------------------------------------\n') f.write(' reg ap_clk;\n') - f.write(' reg ap_rst_n;\n\n') + f.write(f' reg {rst_name};\n\n') - # ---------------------------------------------------------------------- + #---------------------------------------------------------------------- # Control and Handshaking Signals - # ---------------------------------------------------------------------- + #---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Control and Handshaking Signals\n') f.write(' //------------------------------------------------------------------------\n') f.write(' reg ap_start;\n') f.write(' wire ap_done;\n\n') - # ---------------------------------------------------------------------- - # AXI4-Stream Input Interfaces - # ---------------------------------------------------------------------- - f.write(' //------------------------------------------------------------------------\n') - f.write(' // AXI4-Stream Input Interfaces\n') - f.write(' //------------------------------------------------------------------------\n') - - for layer in nn_config['inputs']: - total_bits = layer['integer_bits'] + layer['fractional_bits'] - batch_size = layer['batch_size'] - f.write(f' reg [{(total_bits * batch_size) - 1}:0] {layer["name"]}_tdata;\n') - f.write(f' reg {layer["name"]}_tvalid;\n') - f.write(f' wire {layer["name"]}_tready;\n\n') - - # ---------------------------------------------------------------------- - # AXI4-Stream Output Interfaces - # 
---------------------------------------------------------------------- - f.write(' //------------------------------------------------------------------------\n') - f.write(' // AXI4-Stream Output Interfaces\n') - f.write(' //------------------------------------------------------------------------\n') - - for layer in nn_config['outputs']: - total_bits = layer['integer_bits'] + layer['fractional_bits'] - batch_size = layer['batch_size'] - f.write(f' wire [{(total_bits * batch_size) - 1}:0] {layer["name"]}_tdata;\n') - f.write(f' wire {layer["name"]}_tvalid;\n') - f.write(f' reg {layer["name"]}_tready;\n\n') - - # ---------------------------------------------------------------------- + if(pragma == 'stream'): + #---------------------------------------------------------------------- + # AXI4-Stream Input Interfaces + #---------------------------------------------------------------------- + f.write(' //------------------------------------------------------------------------\n') + f.write(' // AXI4-Stream Input Interfaces\n') + f.write(' //------------------------------------------------------------------------\n') + + for layer in nn_config['inputs']: + name = layer["name"] + total_bits = layer['integer_bits'] + layer['fractional_bits'] + batch_size = layer['batch_size'] + f.write(f' reg [{(total_bits * batch_size) - 1}:0] {name}_tdata;\n') + f.write(f' reg {name}_tvalid;\n') + f.write(f' wire {name}_tready;\n\n') + + #---------------------------------------------------------------------- + # AXI4-Stream Output Interfaces + #---------------------------------------------------------------------- + f.write(' //------------------------------------------------------------------------\n') + f.write(' // AXI4-Stream Output Interfaces\n') + f.write(' //------------------------------------------------------------------------\n') + + for layer in nn_config['outputs']: + name = layer["name"] + total_bits = layer['integer_bits'] + layer['fractional_bits'] + batch_size = layer['batch_size'] + f.write(f' wire [{(total_bits * batch_size) - 1}:0] {name}_tdata;\n') + f.write(f' wire {name}_tvalid;\n') + f.write(f' reg {name}_tready;\n\n') + else: + #---------------------------------------------------------------------- + # Partitioned Input Interfaces + #---------------------------------------------------------------------- + f.write(' //------------------------------------------------------------------------\n') + f.write(' // Partitioned Input Interfaces\n') + f.write(' //------------------------------------------------------------------------\n') + + for layer in nn_config['inputs']: + name = layer["name"] + total_bits = layer['integer_bits'] + layer['fractional_bits'] + batch_size = layer['batch_size'] + for idx in range(batch_size): + f.write(f' reg [{total_bits - 1}:0] {name}_{idx};\n') + f.write(f' reg {name}_{idx}_ap_vld;\n') + + #---------------------------------------------------------------------- + # Partitioned Output Interfaces + #---------------------------------------------------------------------- + f.write(' //------------------------------------------------------------------------\n') + f.write(' // Partitioned Output Interfaces\n') + f.write(' //------------------------------------------------------------------------\n') + + for layer in nn_config['outputs']: + name = layer["name"] + total_bits = layer['integer_bits'] + layer['fractional_bits'] + batch_size = layer['batch_size'] + for idx in range(batch_size): + f.write(f' wire [{total_bits - 1}:0] {name}_{idx};\n') + f.write(f' wire 
{name}_{idx}_ap_vld;\n') + + #---------------------------------------------------------------------- # DUT Instantiation - # ---------------------------------------------------------------------- + #---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // DUT Instantiation\n') f.write(' //------------------------------------------------------------------------\n') f.write(' stitched_design dut (\n') f.write(' .ap_clk(ap_clk),\n') f.write(' .ap_done(ap_done),\n') - f.write(' .ap_rst_n(ap_rst_n),\n') + f.write(f' .{rst_name}({rst_name}),\n') f.write(' .ap_start(ap_start),\n') # Connect input interfaces for layer in nn_config['inputs']: name = layer["name"] - f.write(f' .{name}_tdata({name}_tdata),\n') - f.write(f' .{name}_tready({name}_tready),\n') - f.write(f' .{name}_tvalid({name}_tvalid),\n') + batch_size = layer['batch_size'] + if (pragma == 'stream'): + f.write(f' .{name}_tdata({name}_tdata),\n') + f.write(f' .{name}_tready({name}_tready),\n') + f.write(f' .{name}_tvalid({name}_tvalid),\n') + else: + for idx in range(batch_size): + f.write(f' .{name}_{idx}({name}_{idx}),\n') + f.write(f' .{name}_{idx}_ap_vld({name}_{idx}_ap_vld),\n') # Connect output interfaces (all but last have trailing comma) for layer in nn_config['outputs'][:-1]: name = layer["name"] - f.write(f' .{name}_tdata({name}_tdata),\n') - f.write(f' .{name}_tready({name}_tready),\n') - f.write(f' .{name}_tvalid({name}_tvalid),\n') + batch_size = layer['batch_size'] + if (pragma == 'stream'): + f.write(f' .{name}_tdata({name}_tdata),\n') + f.write(f' .{name}_tready({name}_tready),\n') + f.write(f' .{name}_tvalid({name}_tvalid),\n') + else: + for idx in range(batch_size): + f.write(f' .{name}_{idx}({name}_{idx}),\n') + f.write(f' .{name}_{idx}_ap_vld({name}_{idx}_ap_vld),\n') # Last output interface (no trailing comma) last_output_layer = nn_config['outputs'][-1] name = last_output_layer["name"] - f.write(f' .{name}_tdata({name}_tdata),\n') - f.write(f' .{name}_tready({name}_tready),\n') - f.write(f' .{name}_tvalid({name}_tvalid)\n') + batch_size = last_output_layer['batch_size'] + if (pragma == 'stream'): + f.write(f' .{name}_tdata({name}_tdata),\n') + f.write(f' .{name}_tready({name}_tready),\n') + f.write(f' .{name}_tvalid({name}_tvalid)\n') + else: + for idx in range(batch_size): + f.write(f' .{name}_{idx}({name}_{idx}),\n') + if idx < batch_size - 1: + f.write(f' .{name}_{idx}_ap_vld({name}_{idx}_ap_vld),\n') + else: + f.write(f' .{name}_{idx}_ap_vld({name}_{idx}_ap_vld)\n') f.write(' );\n\n') - # ---------------------------------------------------------------------- + #---------------------------------------------------------------------- # Clock Generation - # ---------------------------------------------------------------------- + #---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Clock Generation (100 MHz => 10 ns period)\n') f.write(' //------------------------------------------------------------------------\n') @@ -180,37 +224,51 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(' forever #5 ap_clk = ~ap_clk;\n') f.write(' end\n\n') - # ---------------------------------------------------------------------- + #---------------------------------------------------------------------- # Reset Generation - # 
---------------------------------------------------------------------- + #---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Reset Generation\n') - f.write(' // Wait for a few cycles and then release reset.\n') + f.write(' // Wait for a cycle and then release reset.\n') f.write(' //------------------------------------------------------------------------\n') f.write(' initial begin\n') - f.write(' ap_rst_n = 0;\n') - f.write(' repeat (5) @(posedge ap_clk);\n') - f.write(' ap_rst_n = 1;\n') + if rst_name == 'ap_rst_n': + f.write(f' {rst_name} = 0;\n') + f.write(' repeat (1) @(posedge ap_clk);\n') + f.write(f' {rst_name} = 1;\n') + else: + f.write(f' {rst_name} = 1;\n') + f.write(' repeat (1) @(posedge ap_clk);\n') + f.write(f' {rst_name} = 0;\n') f.write(' end\n\n') - # ---------------------------------------------------------------------- + #---------------------------------------------------------------------- # Signal Initialization - # ---------------------------------------------------------------------- + #---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Signal Initialization\n') f.write(' // Initialize control signals, input valid, and output ready.\n') f.write(' //------------------------------------------------------------------------\n') f.write(' initial begin\n') f.write(' ap_start = 0;\n') - for name, _ in input_signals: - f.write(f' {name}_tvalid = 0;\n') - for name, _ in output_signals: - f.write(f' {name}_tready = 1;\n') + + for layer in nn_config['inputs']: + name = layer['name'] + batch_size = layer['batch_size'] + if pragma == 'stream': + f.write(f' {name}_tvalid = 0;\n') + else: + for idx in range(batch_size): + f.write(f' {name}_{idx}_ap_vld = 0;\n') + if pragma == 'stream': + for layer in nn_config['outputs']: + name = layer['name'] + f.write(f' {name}_tready = 1;\n') f.write(' end\n\n') - # ---------------------------------------------------------------------- + #---------------------------------------------------------------------- # Variables for Logging and Measurement - # ---------------------------------------------------------------------- + #---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Logging and Measurement Variables\n') f.write(' //------------------------------------------------------------------------\n') @@ -222,33 +280,39 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(' reg [63:0] start_cycle = 0;\n') f.write(' reg [63:0] end_cycle = 0;\n') f.write(' reg [1:0] done_counter = 0;\n') - f.write(' reg old_ap_done = 0;\n\n') + f.write(' reg old_ap_done = 0;\n\n') - # ---------------------------------------------------------------------- + #---------------------------------------------------------------------- # Cycle Counting - # ---------------------------------------------------------------------- + #---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Cycle Counting\n') f.write(' // Count cycles to measure latency.\n') f.write(' //------------------------------------------------------------------------\n') f.write(' always @(posedge ap_clk) 
begin\n')
-        f.write('        if (!ap_rst_n)\n')
-        f.write('            cycle_count <= 0;\n')
+        if rst_name == 'ap_rst_n':
+            f.write(f'        if (!{rst_name})\n')
+            f.write('            cycle_count <= 0;\n')
+        else:
+            f.write(f'        if ({rst_name})\n')
+            f.write('            cycle_count <= 0;\n')
         f.write('        else\n')
         f.write('            cycle_count <= cycle_count + 1;\n')
         f.write('    end\n\n')

-        # ----------------------------------------------------------------------
+        #----------------------------------------------------------------------
         # Data Transmission (Stimulus Generation)
-        # ----------------------------------------------------------------------
+        #----------------------------------------------------------------------
         f.write('    //------------------------------------------------------------------------\n')
         f.write('    // Data Transmission (Stimulus)\n')
         f.write('    //     Send input patterns to the DUT.\n')
         f.write('    //------------------------------------------------------------------------\n')
         f.write('    initial begin\n')
         f.write('        // Wait until reset is de-asserted\n')
-        f.write('        wait (ap_rst_n == 1);\n')
-        f.write('        repeat (2) @(posedge ap_clk);\n\n')
+        if rst_name == 'ap_rst_n':
+            f.write(f'        wait ({rst_name} == 1);\n')
+        else:
+            f.write(f'        wait ({rst_name} == 0);\n')

         f.write('        // Open CSV log file\n')
         f.write('        csv_file = $fopen("../../../../testbench_log.csv", "w");\n')
@@ -258,10 +322,13 @@ def write_verilog_testbench(nn_config, testbench_output_path):
         f.write('        end\n')
         f.write('        $fwrite(csv_file, "output_name,index,value\\n");\n\n')

-        f.write('        // Start the DUT\n')
-        f.write('        ap_start = 1;\n\n')
+        if pragma == 'stream':
+            f.write('        // Start the DUT\n')
+            f.write('        ap_start = 1;\n\n')

-        # Send first pattern of inputs (all zeroes)
+        #----------------------------------------------------------------------
+        # Sending first pattern of inputs (all zeroes)
+        #----------------------------------------------------------------------
         for layer in nn_config['inputs']:
             i_bits = layer["integer_bits"]
             f_bits = layer["fractional_bits"]
@@ -269,19 +336,44 @@ def write_verilog_testbench(nn_config, testbench_output_path):
             batch_size = layer['batch_size']
             fifo_depth = layer["fifo_depth"]
             name = layer["name"]
-            f.write(f'        // Sending 1st patern of inputs for {name}\n')
-            f.write(f'        {name}_tvalid = 1;\n')
+            f.write(f'        // Sending first pattern of inputs for {name}\n')
+            if pragma == 'stream':
+                f.write(f'        {name}_tvalid = 1;\n')
             f.write(f'        for (j = 0; j < {fifo_depth}; j = j + 1) begin\n')
             for k in range(batch_size):
                 upper = (k + 1) * total_bits - 1
                 lower = k * total_bits
-                f.write(f'            {name}_tdata[{upper}:{lower}] = 0;\n')
-            f.write(f'            while ({name}_tready == 0) @(posedge ap_clk);\n')
-            f.write('            @(posedge ap_clk);\n')
+                if pragma == 'stream':
+                    f.write(f'            {name}_tdata[{upper}:{lower}] = 0;\n')
+                else:
+                    f.write(f'            {name}_{k} = 0;\n')
+            if pragma == 'stream':
+                f.write(f'            while ({name}_tready == 0) @(posedge ap_clk);\n')
+                f.write('            @(posedge ap_clk);\n')
             f.write('        end\n')
-            f.write(f'        {name}_tvalid = 0;\n\n')
-
-        # Send second pattern of inputs (read from file)
+            if pragma == 'stream':
+                f.write(f'        {name}_tvalid = 0;\n\n')
+            else:
+                f.write(f'        // Assert valid signals\n')
+                for k in range(batch_size):
+                    f.write(f'        {name}_{k}_ap_vld = 1;\n')
+                f.write('        // Start the DUT\n')
+                f.write(f'        ap_start = 1;\n')
+                f.write(f'        @(posedge ap_clk);\n')
+                f.write(f'        ap_start = 0;\n')
+                f.write(f'        // Deassert valid signals\n')
+                for k in range(batch_size):
+                    f.write(f'        {name}_{k}_ap_vld = 0;\n')
+                f.write(f'\n')
+                f.write(f'        // Wait for ap_done to go high\n')
+                f.write(f'        wait (ap_done);\n')
+                f.write(f'        // Wait for ap_done to go low before sending the next input\n')
+                f.write(f'        wait (!ap_done);\n')
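+                # NOTE: the partitioned interface has no tvalid/tready backpressure;
+                # each inference is launched by pulsing ap_start and synchronized
+                # on the ap_done handshake instead.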

+        #----------------------------------------------------------------------
+        # Sending second pattern of inputs (read from file)
+        #----------------------------------------------------------------------
         for layer in nn_config['inputs']:
             i_bits = layer["integer_bits"]
             f_bits = layer["fractional_bits"]
@@ -290,8 +382,9 @@ def write_verilog_testbench(nn_config, testbench_output_path):
             fifo_depth = layer["fifo_depth"]
             name = layer["name"]
             input_file = f"{name}_input_data.txt"
-            f.write(f'        // Sending 2nd pattern of inputs for {name}\n')
-            f.write(f'        {name}_tvalid = 1;\n')
+            f.write(f'        // Sending second pattern of inputs for {name}\n')
+            if pragma == 'stream':
+                f.write(f'        {name}_tvalid = 1;\n')
             f.write(f'        file = $fopen("../../../../{input_file}", "r");\n')
             f.write(f'        if (file == 0) begin\n')
             f.write(f'            $display("Error opening file {input_file}");\n')
@@ -303,16 +396,32 @@ def write_verilog_testbench(nn_config, testbench_output_path):
                 upper = (k + 1) * total_bits - 1
                 lower = k * total_bits
                 f.write(f'            r = $fscanf(file, "%d", value);\n')
-                f.write(f'            {name}_tdata[{upper}:{lower}] = value;\n')
-            f.write(f'            while ({name}_tready == 0) @(posedge ap_clk);\n')
-            f.write('            @(posedge ap_clk);\n')
+                if pragma == 'stream':
+                    f.write(f'            {name}_tdata[{upper}:{lower}] = value;\n')
+                else:
+                    f.write(f'            {name}_{k} = value;\n')
+            if pragma == 'stream':
+                f.write(f'            while ({name}_tready == 0) @(posedge ap_clk);\n')
+                f.write('            @(posedge ap_clk);\n')
             f.write('        end\n')
+            if pragma == 'partition':
+                f.write(f'        // Assert valid signals\n')
+                for k in range(batch_size):
+                    f.write(f'        {name}_{k}_ap_vld = 1;\n')
+                f.write('        // Start the DUT\n')
+                f.write(f'        ap_start = 1;\n')
+                f.write(f'        @(posedge ap_clk);\n')
+                f.write(f'        ap_start = 0;\n')
+                f.write(f'        // Deassert valid signals\n')
+                for k in range(batch_size):
+                    f.write(f'        {name}_{k}_ap_vld = 0;\n')
+                f.write(f'\n')
         f.write('    end\n\n')

-        # ----------------------------------------------------------------------
+        #----------------------------------------------------------------------
         # Output Data Capture and Logging
-        # ----------------------------------------------------------------------
+        #----------------------------------------------------------------------
         f.write('    //------------------------------------------------------------------------\n')
         f.write('    // Output Data Capture and Logging\n')
         f.write('    //     Capture output for 2nd input (done_counter == 1) and log them to CSV.\n')
@@ -323,34 +432,50 @@ def write_verilog_testbench(nn_config, testbench_output_path):
             f_bits = layer['fractional_bits']
             total_bits = i_bits + f_bits
             layer_name = layer["name"]
+            batch_size = layer["batch_size"]
             f.write(f'    //Output capture for {layer_name}\n')
             f.write(f'    integer idx_{i};\n')
             f.write(f'    reg signed [{total_bits-1}:0] fixed_val_{i};\n')
             f.write(f'    real real_val_{i};\n')
             f.write(f'    always @(posedge ap_clk) begin\n')
-            f.write(f'        if (done_counter == 1 && {layer_name}_tvalid && {layer_name}_tready) begin\n')
-            f.write(f'            for (idx_{i} = 0; idx_{i} < {layer["batch_size"]}; idx_{i} = idx_{i} + 1) begin\n')
-            f.write(f'                fixed_val_{i} = {layer_name}_tdata[(idx_{i}+1)*{total_bits}-1 -: 
{total_bits}];\n') - f.write(f' real_val_{i} = fixed_val_{i} / (1.0 * (1 << {f_bits}));\n') - f.write( - f' $display("Output {layer_name}[%0d]: integer_bits=%0d fractional_bits=%0d value=%f", idx_{i}, {i_bits}, {f_bits}, real_val_{i});\n' - ) - f.write(' // Log result to CSV\n') - f.write(f' $fwrite(csv_file, "%s,%0d,%f\\n", "{layer_name}", idx_{i}, real_val_{i});\n') - f.write(' end\n') + if pragma == 'stream': + f.write(f' if (done_counter == 1 && {layer_name}_tvalid && {layer_name}_tready) begin\n') + f.write(f' for (idx_{i} = 0; idx_{i} < {batch_size}; idx_{i} = idx_{i} + 1) begin\n') + f.write(f' fixed_val_{i} = {layer_name}_tdata[(idx_{i}+1)*{total_bits}-1 -: {total_bits}];\n') + f.write(f' real_val_{i} = fixed_val_{i} / (1.0 * (1 << {f_bits}));\n') + f.write(f' $display("Output {layer_name}[%0d]: integer_bits=%0d fractional_bits=%0d value=%f", idx_{i}, {i_bits}, {f_bits}, real_val_{i});\n') + f.write(' // Log result to CSV\n') + f.write(f' $fwrite(csv_file, "%s,%0d,%f\\n", "{layer_name}", idx_{i}, real_val_{i});\n') + f.write(' end\n') + else: + f.write(f' // Note: The expected behavior in most cases is to have valid outputs (ap_vld=1) when ap_done = 1\n') + f.write(f' if (done_counter == 1 && ap_done == 1) begin\n') + for idx in range (batch_size): + f.write(f' fixed_val_{i} = {layer_name}_{idx}[{total_bits - 1}:0];\n') + f.write(f' real_val_{i} = fixed_val_{i} / (1.0 * (1 << {f_bits}));\n') + f.write(f' $display("Output {layer_name}_{idx}: integer_bits=%0d fractional_bits=%0d value=%f", {i_bits}, {f_bits}, real_val_{i});\n') + f.write(' // Log result to CSV\n') + f.write(f' $fwrite(csv_file, "%s,%0d,%f\\n", "{layer_name}", {idx}, real_val_{i});\n') f.write(' end\n') f.write(' end\n\n') - # ---------------------------------------------------------------------- + #---------------------------------------------------------------------- # Latency Measurement and Test End - # ---------------------------------------------------------------------- + #---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Latency Measurement\n') f.write(' // Measures the cycle count between start and subsequent ap_done signals.\n') f.write(' //------------------------------------------------------------------------\n') f.write(' always @(posedge ap_clk) begin\n') - f.write(' if (!ap_rst_n) begin\n') + if rst_name == 'ap_rst_n': + f.write(f' if (!{rst_name}) begin\n') + else: + f.write(f' if ({rst_name}) begin\n') f.write(' old_ap_done <= 0;\n') f.write(' end else begin\n') f.write(' old_ap_done <= ap_done;\n') From e070ea10e21605887391802883ac5e572c5fc5ce Mon Sep 17 00:00:00 2001 From: dimdano Date: Fri, 14 Feb 2025 17:37:59 +0100 Subject: [PATCH 41/50] support for MultiModelGraph predict using chained bridge file --- hls4ml/model/graph.py | 222 +++++------------- .../templates/vivado/build_lib_multigraph.sh | 6 +- .../vivado/myproject_bridge_multigraph.cpp | 70 ------ hls4ml/writer/vivado_writer.py | 201 ++++++++++++++-- 4 files changed, 246 insertions(+), 253 deletions(-) delete mode 100644 hls4ml/templates/vivado/myproject_bridge_multigraph.cpp diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index aa2bf815a2..4c8cdb6677 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -981,10 +981,9 @@ def make_multi_graph(cls, config, layer_list, input_layers, output_layers, outpu if previous_layer_name in sub_config['HLSConfig']['LayerName']: prev_layer_config = 
sub_config['HLSConfig']['LayerName'][previous_layer_name]
                     new_layer_config = {}
-                    new_layer_config['Precision'] = prev_layer_config['Precision']
+                    new_layer_config['Precision'] = last_output_precision if last_output_precision is not None else 'auto'
                     # NOTE - We copy Trace as well but it might be better to reset it
                     new_layer_config['Trace'] = prev_layer_config['Trace']
-                    # copy last layer config from previous graph to the new input layer config of current graph
                     sub_config['HLSConfig']['LayerName'][input_layer_name] = new_layer_config
                 else:
                     raise KeyError(f"Layer '{previous_layer_name}' not found in subconfig.")
@@ -997,23 +996,19 @@ def make_multi_graph(cls, config, layer_list, input_layers, output_layers, outpu
                 sub_config, sub_layer_list, graph_input_layers, graph_output_layers, initial_index=current_index
             )

-            # After creating subgraph, get the precision from the last layer's output.
+            # After creating the subgraph, extract the actual precision from the last layer's result.
             if hls_model.graph:
-                try:
-                    last_layer = next(reversed(hls_model.graph.values()))
-                    last_output_precision = last_layer.attributes['precision']['result']
-                except (KeyError, AttributeError):
-                    warnings.warn(
-                        "Could not find precision in the last layer. " "Setting 'last_output_precision' to 'auto'."
-                    )
-                    last_output_precision = 'auto'
+                last_layer = next(reversed(hls_model.graph.values()))
+                last_prec = last_layer.attributes.get('result_t')
+                last_output_precision = last_prec.precision if hasattr(last_prec, 'precision') else last_prec
+                if last_output_precision is None or last_output_precision == 'auto':
+                    raise ValueError("Could not extract a valid precision from the last layer!")

-            # Update the current index for the next graph
-            # Get the index of the last element in the graph
+            # Update current_index based on the new graph (accounting for the inserted input layer).
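+            # The next subgraph receives a synthetic InputLayer that occupies an
+            # index of its own, hence the max_index - 1 offset below.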
layer_indices = [layer.index for layer in hls_model.graph.values()] if layer_indices: max_index = max(layer_indices) - current_index = max_index - 1 # we have the new input layer as well + current_index = max_index - 1 model_graphs.append(hls_model) return MultiModelGraph(model_graphs) @@ -1022,32 +1017,45 @@ def make_multi_graph(cls, config, layer_list, input_layers, output_layers, outpu class MultiModelGraph: def __init__(self, graphs): self.graphs = graphs - self.config = copy.copy(self.graphs[0].config) - self._deepcopy_config_names(self.graphs[0].config.config) self._initialize_config(graphs[0]) - self.config.config['StitchedProjectName'] = 'vivado_stitched_design' - self.backend = graphs[0].config.backend + self._bind_modelgraph_methods() + self._initialize_io_attributes(graphs) + + def _initialize_config(self, first_graph): + self.config = copy.copy(first_graph.config) + # Deep copy only 'ProjectName' and 'OutputDir', shallow copy others + keys_to_deepcopy = ['ProjectName', 'OutputDir'] + self.config.config = { + k: copy.deepcopy(first_graph.config.config[k]) if k in keys_to_deepcopy else first_graph.config.config[k] + for k in first_graph.config.config + } + self._update_project_config(first_graph) + self.backend = first_graph.config.backend + + def _bind_modelgraph_methods(self): + # Bind necessary ModelGraph methods to this instance + self._compile = ModelGraph._compile.__get__(self, MultiModelGraph) + self.get_output_variables = ModelGraph.get_output_variables.__get__(self, MultiModelGraph) + self._compute_n_samples = ModelGraph._compute_n_samples.__get__(self, MultiModelGraph) + self._get_top_function = ModelGraph._get_top_function.__get__(self, MultiModelGraph) + self._predict = ModelGraph.predict.__get__(self, MultiModelGraph) + self.trace = ModelGraph.trace.__get__(self, MultiModelGraph) + + def _initialize_io_attributes(self, graphs): self.graph_reports = None self._top_function_lib = None - self.config.config['Stamp'] = '64616e' self.inputs = graphs[0].inputs self.outputs = graphs[-1].outputs - self._compile = ModelGraph._compile.__get__(self, MultiModelGraph) + self.output_vars = graphs[-1].output_vars - def _initialize_config(self, first_graph): - """ - Initialize the configuration using details from the first graph - """ + def _update_project_config(self, first_graph): original_project_name = first_graph.config.get_project_name().partition('_graph')[0] self.config.config['ProjectName'] = f"{original_project_name}_stitched" self.config.config['OriginalProjectName'] = original_project_name original_output_dir = first_graph.config.get_output_dir().partition('/graph')[0] self.config.config['OutputDir'] = os.path.join(original_output_dir, 'stitched') - - def _deepcopy_config_names(self, config): - # Deep copy only 'ProjectName' and 'OutputDir', shallow copy others - keys_to_deepcopy = ['ProjectName', 'OutputDir'] - self.config.config = {k: copy.deepcopy(config[k]) if k in keys_to_deepcopy else config[k] for k in config} + self.config.config['StitchedProjectName'] = 'vivado_stitched_design' + self.config.config['Stamp'] = '64616e' def __getitem__(self, index): return self.graphs[index] @@ -1140,6 +1148,9 @@ def build_wrapper(idx, g, **kwargs): if stitch_design or sim_stitched_design or export_stitched_design: self._assert_consistent_pragmas() + vivado_folder = os.path.join(self.config.config['OutputDir'], self.config.config['StitchedProjectName']) + if os.path.exists(vivado_folder): + raise FileExistsError(f"Vivado stitched project folder '{vivado_folder}' already exists.") 
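+            # NOTE: stitching requires matching interface pragmas across subgraphs
+            # (checked above) and refuses to overwrite an existing Vivado project.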
nn_config = self.parse_nn_config() stitched_report = self.backend.build_stitched_design( stitch_design=stitch_design, @@ -1155,18 +1166,13 @@ def build_wrapper(idx, g, **kwargs): def compile(self): for g in self.graphs: g.compile() - # TODO - # self.write_build_script() - # self.write_bridge() - # self._compile() + # Bypass VitisWriter and invoke write_hls directly from VivadoWriter + super(self.backend.writer.__class__, self.backend.writer).write_hls(self, is_multigraph=True) + self._compile() def predict(self, x, sim='csim'): if sim == 'csim': - input_data = x - for g in self.graphs: - output_data = g.predict(input_data) - input_data = output_data - return output_data + return self._predict(x) elif sim == 'rtl': nn_config = self.parse_nn_config() stitched_report = self.backend.build_stitched_design( @@ -1180,134 +1186,22 @@ def predict(self, x, sim='csim'): return stitched_report['BehavSimResults'] else: print('Unknown simulation option given.') - + def trace(self, x): - # TODO: finish trace function - input_data = x - trace_output = [] - for g in self.graphs: - output_data, curr_trace_output = g.trace(input_data) - input_data = output_data - trace_output.append(curr_trace_output) - return output_data, trace_output - - def write_build_script(self): - # NOTE we need to move this function to Vivado writer with each graph object - spec = importlib.util.find_spec('hls4ml') - hls4ml_path = os.path.dirname(spec.origin) - build_lib_src = os.path.join(hls4ml_path, 'templates/vivado/build_lib_multigraph.sh') - os.makedirs(self.config.config['OutputDir'], exist_ok=True) - build_lib_dst = os.path.join(self.config.config['OutputDir'], 'build_lib.sh') - graph_project_names = ' '.join(f"\"{g.config.get_output_dir().split('/')[-1]}\"" for g in self.graphs) - with open(build_lib_src) as src, open(build_lib_dst, 'w') as dst: - for line in src.readlines(): - line = line.replace('myproject', self.config.config['OriginalProjectName']) - line = line.replace('myproject_stitched', self.config.config['ProjectName']) - line = line.replace('mystamp', self.config.config['Stamp']) - line = line.replace('mygraph_name_list', graph_project_names) - dst.write(line) - os.chmod(build_lib_dst, os.stat(build_lib_dst).st_mode | stat.S_IEXEC) - - def write_bridge(self): - # NOTE we need to move this function to Vivado writer with each graph object - """Write the Python-C++ bridge (myproject_bridge.cpp) - Args: - model (ModelGraph): the hls4ml model. 
- """ - - filedir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(filedir, '../templates/vivado/myproject_bridge_multigraph.cpp')) - fout = open(f"{self.config.get_output_dir()}/{self.config.config['ProjectName']}_bridge.cpp", 'w') - model_inputs = self.graphs[0].get_input_variables() - model_outputs = self.graphs[-1].get_output_variables() - model_brams = [var for var in self.graphs[0].get_weight_variables() if var.storage.lower() == 'bram'] - - indent = ' ' - - for line in f.readlines(): - newline = '' - if 'MYPROJECT' in line: - newline = line.replace('MYPROJECT', format(self.config.config['ProjectName'].upper())) - elif 'firmware/myproject' in line: - for graph_idx in range(len(self.graphs)): - newline += line.replace('myproject', format(self.graphs[graph_idx].config.config['ProjectName'])) - newline += '\n#undef DEFINES_H_\n' if graph_idx < len(self.graphs) - 1 else '' - elif 'myproject' in line: - newline = line.replace('myproject', format(self.graphs[0].config.config['ProjectName'])) - - elif '// hls-fpga-machine-learning insert bram' in line: - newline = line - for bram in model_brams: - newline += f'#include \"firmware/weights/{bram.name}.h\"\n' - - elif '// hls-fpga-machine-learning insert header' in line: - dtype = line.split('#', 1)[1].strip() - inputs_str = ', '.join([f'{dtype} {i.name}[{i.size_cpp()}]' for i in model_inputs]) - outputs_str = ', '.join([f'{dtype} {o.name}[{o.size_cpp()}]' for o in model_outputs]) - - newline = '' - newline += indent + inputs_str + ',\n' - newline += indent + outputs_str + '\n' - - elif '// hls-fpga-machine-learning insert wrapper' in line: - dtype = line.split('#', 1)[1].strip() - newline = '' - for i in model_inputs: - newline += indent + '{var};\n'.format(var=i.definition_cpp(name_suffix='_ap')) - newline += indent + 'nnet::convert_data<{}, {}, {}>({}, {}_ap);\n'.format( - dtype, i.type.name, i.size_cpp(), i.name, i.name - ) - newline += '\n' - - for o in model_outputs: - newline += indent + '{var};\n'.format(var=o.definition_cpp(name_suffix='_ap')) - - newline += '\n' - - input_vars = ','.join([i.name + '_ap' for i in model_inputs]) - bram_vars = ','.join([b.name for b in model_brams]) - output_vars = ','.join([o.name + '_ap' for o in model_outputs]) - - # Concatenate the input, output, and bram variables. 
Filter out empty/null values - all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars])) - - top_level = indent + f"//{self.config.config['ProjectName']}({all_vars});\n" - newline += top_level - - newline += '\n' - - for o in model_outputs: - newline += indent + 'nnet::convert_data<{}, {}, {}>({}_ap, {});\n'.format( - o.type.name, dtype, o.size_cpp(), o.name, o.name - ) - - elif '// hls-fpga-machine-learning insert trace_outputs' in line: - newline = '' - for layer in self.graphs[0].get_layers(): - func = layer.get_attr('function_cpp', None) - if func and self.graphs[0].config.trace_output and layer.get_attr('trace', False): - vars = layer.get_variables() - for var in vars: - newline += ( - indent - + 'nnet::trace_outputs->insert(std::pair(' - + f'"{layer.name}", (void *) malloc({var.size_cpp()} * element_size)));\n' - ) - - elif '// hls-fpga-machine-learning insert namespace' in line: - newline = '' - - namespace = self.config.get_writer_config().get('Namespace', None) - if namespace is not None: - newline += indent + f'using namespace {namespace};\n' - - else: - newline = line - fout.write(newline) - - f.close() - fout.close() + raise NotImplementedError("Trace function has not been implemented yet for MultiModelGraph.") + def get_input_variables(self): + variables = [] + for inp in self.inputs: + variables.append(self.graphs[0].graph[inp].get_output_variable()) + return variables + + def get_layers(self): + all_values = [] + for g in self.graphs: + all_values.extend(g.graph.values()) + return dict(zip(all_values, all_values)).values() + def _get_pragma_details(self, pragma): """ Extracts the pragma type and FIFO depth from the given pragma. diff --git a/hls4ml/templates/vivado/build_lib_multigraph.sh b/hls4ml/templates/vivado/build_lib_multigraph.sh index 0bb2c76c29..8ef0c9cc9d 100644 --- a/hls4ml/templates/vivado/build_lib_multigraph.sh +++ b/hls4ml/templates/vivado/build_lib_multigraph.sh @@ -15,19 +15,19 @@ ORIGINAL_PROJECT=myproject PROJECT=myproject_stitched LIB_STAMP=mystamp BASEDIR="$(cd "$(dirname "$0")" && cd .. 
&& pwd)" -AP_TYPES_PATH="-I${BASEDIR}/${graph_project_names[0]}/firmware/ap_types/" INCFLAGS="" OUTPUT_DIR="${BASEDIR}/stitched/firmware" +WEIGHTS_DIR="\"${BASEDIR}/stitched/firmware/weights\"" mkdir -p "${OUTPUT_DIR}" # Compile all graphs OBJECT_FILES=() for g in "${graph_project_names[@]}"; do - WEIGHTS_DIR="\"${BASEDIR}/${g}/firmware/weights\"" SRC_FILE="${g}/firmware/${ORIGINAL_PROJECT}_${g}.cpp" OBJ_FILE="${ORIGINAL_PROJECT}_${g}.o" - + AP_TYPES_PATH="-I${BASEDIR}/${g}/firmware/ap_types/" + ${CC} ${CFLAGS} ${AP_TYPES_PATH} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c "${BASEDIR}/${SRC_FILE}" -o "${OBJ_FILE}" OBJECT_FILES+=("${OBJ_FILE}") INCFLAGS+="-I${BASEDIR}/${g}/ " diff --git a/hls4ml/templates/vivado/myproject_bridge_multigraph.cpp b/hls4ml/templates/vivado/myproject_bridge_multigraph.cpp deleted file mode 100644 index edd75a4246..0000000000 --- a/hls4ml/templates/vivado/myproject_bridge_multigraph.cpp +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef MYPROJECT_BRIDGE_H_ -#define MYPROJECT_BRIDGE_H_ - -#include "firmware/myproject.h" - -#include "firmware/nnet_utils/nnet_helpers.h" -#include -#include - -// hls-fpga-machine-learning insert bram - -namespace nnet { -bool trace_enabled = false; -std::map *trace_outputs = NULL; -size_t trace_type_size = sizeof(double); -} // namespace nnet - -extern "C" { - -struct trace_data { - const char *name; - void *data; -}; - -void allocate_trace_storage(size_t element_size) { - nnet::trace_enabled = true; - nnet::trace_outputs = new std::map; - nnet::trace_type_size = element_size; - // hls-fpga-machine-learning insert trace_outputs -} - -void free_trace_storage() { - for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { - void *ptr = i->second; - free(ptr); - } - nnet::trace_outputs->clear(); - delete nnet::trace_outputs; - nnet::trace_outputs = NULL; - nnet::trace_enabled = false; -} - -void collect_trace_output(struct trace_data *c_trace_outputs) { - int ii = 0; - for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { - c_trace_outputs[ii].name = i->first.c_str(); - c_trace_outputs[ii].data = i->second; - ii++; - } -} - -// Wrapper of top level function for Python bridge -void myproject_float( - // hls-fpga-machine-learning insert header #float -) { - // hls-fpga-machine-learning insert namespace - - // hls-fpga-machine-learning insert wrapper #float -} - -void myproject_double( - // hls-fpga-machine-learning insert header #double -) { - // hls-fpga-machine-learning insert namespace - - // hls-fpga-machine-learning insert wrapper #double -} -} - -#endif diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index f4fe03271a..7f013253b2 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -450,6 +450,21 @@ def write_weights(self, model): weights, model.config.get_output_dir(), namespace=namespace, write_txt_file=write_txt ) + def write_multigraph_weights(self, model): + """Write the weights into header files + + Args: + model (MultiModelGraph): the hls4ml multigraph model. 
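+
+        Note:
+            All subgraphs write their weight headers into the stitched project's
+            shared output directory, which build_lib_multigraph.sh points
+            WEIGHTS_DIR at.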
+ """ + namespace = model.config.get_writer_config().get('Namespace', None) + write_txt = model.config.get_writer_config().get('WriteWeightsTxt', True) + for g in model.graphs: + for layer in g.get_layers(): + for weights in layer.get_weights(): + self.print_array_to_cpp( + weights, model.config.get_output_dir(), namespace=namespace, write_txt_file=write_txt + ) + def __make_dat_file(self, original_path, project_path): """ Convert other input/output data types into a dat file, which is @@ -693,6 +708,132 @@ def write_bridge(self, model): f.close() fout.close() + def write_bridge_multigraph(self, model): + """Write the Python-C++ bridge (myproject_bridge.cpp) + Args: + model (MultiModelGraph): the hls4ml multigraph model. + """ + + filedir = os.path.dirname(os.path.abspath(__file__)) + f = open(os.path.join(filedir, '../templates/vivado/myproject_bridge.cpp')) + fout = open(f"{model.config.get_output_dir()}/{model.config.config['ProjectName']}_bridge.cpp", 'w') + model_inputs = model.graphs[0].get_input_variables() + model_outputs = model.graphs[-1].get_output_variables() + model_brams = [var for var in model.graphs[0].get_weight_variables() if var.storage.lower() == 'bram'] + + indent = ' ' + + for line in f.readlines(): + newline = '' + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', format(model.config.config['ProjectName'].upper())) + elif 'firmware/myproject' in line: + for graph_idx, g in enumerate(model.graphs): + newline += '#undef DEFINES_H_\n' + if len(g.outputs) == 1: + newline += '#define result_t ' + 'result_graph' + str(graph_idx+1) + '_t\n' + newline += line.replace('myproject', format(model.graphs[graph_idx].config.config['ProjectName'])) + if len(g.outputs) == 1: + newline += 'typedef result_graph' + str(graph_idx+1) + '_t graph' + str(graph_idx+1) + '_result_t;\n' + newline += '#undef result_t\n\n' if graph_idx < len(model.graphs) - 1 else '\n' + newline += '\n' + elif 'myproject' in line: + newline = line.replace('myproject', format(model.config.config['ProjectName'])) + + elif '// hls-fpga-machine-learning insert bram' in line: + newline = line + for bram in model_brams: + newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + + elif '// hls-fpga-machine-learning insert header' in line: + dtype = line.split('#', 1)[1].strip() + inputs_str = ', '.join([f'{dtype} {i.name}[{i.size_cpp()}]' for i in model_inputs]) + outputs_str = ', '.join([f'{dtype} {o.name}[{o.size_cpp()}]' for o in model_outputs]) + + newline = '' + newline += indent + inputs_str + ',\n' + newline += indent + outputs_str + '\n' + + elif '// hls-fpga-machine-learning insert wrapper' in line: + dtype = line.split('#', 1)[1].strip() + newline = '' + for i in model_inputs: + newline += indent + '{var};\n'.format(var=i.definition_cpp(name_suffix='_ap')) + newline += indent + 'nnet::convert_data<{}, {}, {}>({}, {}_ap);\n'.format( + dtype, i.type.name, i.size_cpp(), i.name, i.name + ) + newline += '\n' + + for idx, g in enumerate(model.graphs): + for o in g.get_output_variables(): + definition = o.definition_cpp(name_suffix='_ap') + if len(g.outputs) == 1: + parts = definition.split(' ', 1) + datatype = 'graph'+str(idx+1) + '_result_t' + if parts[0].startswith('hls::stream'): + modified_definition = 'hls::stream<' + datatype + '> ' + parts[1] + else: + modified_definition = datatype + ' ' + parts[1] + newline += indent + f"{modified_definition};\n" + else: + newline += indent + f"{definition};\n" + + newline += '\n' + + top_level = '' + output_vars = '' + for idx, g in 
enumerate(model.graphs): + if idx == 0: + input_vars = ','.join([i.name + '_ap' for i in g.get_input_variables()]) + else: + input_vars = output_vars + bram_vars = ','.join([b.name for b in [var for var in g.get_weight_variables() if var.storage.lower() == 'bram']]) + output_vars = ','.join([o.name + '_ap' for o in g.get_output_variables()]) + # Concatenate the input, output, and bram variables. Filter out empty/null values + all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars])) + top_level += indent + f"{g.config.config['ProjectName']}({all_vars});\n" + newline += top_level + + newline += '\n' + + for o in model_outputs: + if len(model.graphs[-1].outputs) == 1: + newline += indent + 'nnet::convert_data<{}, {}, {}>({}_ap, {});\n'.format( + datatype, dtype, o.size_cpp(), o.name, o.name + ) + else: + newline += indent + 'nnet::convert_data<{}, {}, {}>({}_ap, {});\n'.format( + o.type.name, dtype, o.size_cpp(), o.name, o.name + ) + + elif '// hls-fpga-machine-learning insert trace_outputs' in line: + newline = '' + for layer in model.get_layers(): + func = layer.get_attr('function_cpp', None) + if func and model.config.trace_output and layer.get_attr('trace', False): + vars = layer.get_variables() + for var in vars: + newline += ( + indent + + 'nnet::trace_outputs->insert(std::pair(' + + f'"{layer.name}", (void *) malloc({var.size_cpp()} * element_size)));\n' + ) + + elif '// hls-fpga-machine-learning insert namespace' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += indent + f'using namespace {namespace};\n' + + else: + newline = line + fout.write(newline) + + f.close() + fout.close() + + def write_build_script(self, model): """Write the TCL/Shell build scripts (project.tcl, build_prj.tcl, vivado_synth.tcl, build_lib.sh) @@ -740,6 +881,26 @@ def write_build_script(self, model): dst.write(line) build_lib_dst.chmod(build_lib_dst.stat().st_mode | stat.S_IEXEC) + + def write_build_script_multigraph(self, model): + """Write the build script (build_lib.sh) for stitched multigraph project + Args: + model (MultiModelGraph): the hls4ml multigraph model. 
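+
+        Note:
+            The script is generated from build_lib_multigraph.sh by substituting in
+            the project name, library stamp, and the per-graph project directories.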
+ """ + filedir = Path(__file__).parent + os.makedirs(model.config.get_output_dir(), exist_ok=True) + build_lib_src = (filedir / '../templates/vivado/build_lib_multigraph.sh').resolve() + build_lib_dst = Path(f'{model.config.get_output_dir()}/build_lib.sh').resolve() + graph_project_names = ' '.join(f"\"{g.config.get_output_dir().split('/')[-1]}\"" for g in model.graphs) + + with open(build_lib_src) as src, open(build_lib_dst, 'w') as dst: + for line in src.readlines(): + line = line.replace('myproject', model.config.config['OriginalProjectName']) + line = line.replace('myproject_stitched', model.config.config['ProjectName']) + line = line.replace('mystamp', model.config.config['Stamp']) + line = line.replace('mygraph_name_list', graph_project_names) + dst.write(line) + os.chmod(build_lib_dst, os.stat(build_lib_dst).st_mode | stat.S_IEXEC) def write_nnet_utils(self, model): """Copy the nnet_utils, AP types headers and any custom source to the project output directory @@ -847,19 +1008,27 @@ def write_tar(self, model): with tarfile.open(tar_path, mode='w:gz') as archive: archive.add(model.config.get_output_dir(), recursive=True, arcname='') - def write_hls(self, model): - print('Writing HLS project') - self.write_project_dir(model) - self.write_project_cpp(model) - self.write_project_header(model) - self.write_weights(model) - self.write_defines(model) - self.write_parameters(model) - self.write_test_bench(model) - self.write_bridge(model) - self.write_build_script(model) - self.write_nnet_utils(model) - self.write_generated_code(model) - self.write_yml(model) - self.write_tar(model) - print('Done') + def write_hls(self, model, is_multigraph=False): + if not is_multigraph: + print('Writing HLS project') + self.write_project_dir(model) + self.write_project_cpp(model) + self.write_project_header(model) + self.write_weights(model) + self.write_defines(model) + self.write_parameters(model) + self.write_test_bench(model) + self.write_bridge(model) + self.write_build_script(model) + self.write_nnet_utils(model) + self.write_generated_code(model) + self.write_yml(model) + self.write_tar(model) + print('Done') + else: + print('Writing HLS multigraph project') + self.write_project_dir(model) + self.write_build_script_multigraph(model) + self.write_bridge_multigraph(model) + self.write_multigraph_weights(model) + print('Done') From 7fbf439febd184d8f2cabd5cb53215ed28f29e02 Mon Sep 17 00:00:00 2001 From: dimdano Date: Mon, 3 Mar 2025 16:57:23 +0100 Subject: [PATCH 42/50] Add pytest for multi-graph and fix minor issues --- docs/ir/multimodelgraph.rst | 13 +++- hls4ml/backends/vitis/vitis_backend.py | 2 +- hls4ml/model/graph.py | 25 ++++++- hls4ml/utils/simulation_utils.py | 63 +++-------------- test/pytest/test_multi_graph.py | 94 ++++++++++++++++++++++++++ 5 files changed, 137 insertions(+), 60 deletions(-) create mode 100644 test/pytest/test_multi_graph.py diff --git a/docs/ir/multimodelgraph.rst b/docs/ir/multimodelgraph.rst index 4cb0bfc726..abab7f43a7 100644 --- a/docs/ir/multimodelgraph.rst +++ b/docs/ir/multimodelgraph.rst @@ -68,7 +68,7 @@ This allows modular design flows and easier debugging of large models. ``compile`` method ================== -Compiles all the individual ``ModelGraph`` subgraphs within the ``MultiModelGraph``. +Compiles all the individual ``ModelGraph`` subgraphs within the ``MultiModelGraph``. Also, compiles a chained bridge file with all the subgraphs linked together that can be used for the predict function. .. 
code-block:: python @@ -97,7 +97,7 @@ The returned ``report`` contains data from each subgraph's build and, if stitchi ``predict`` method ================== -Performs a forward pass through the chained sub-models using the C-simulation (``sim='csim'``). Data is automatically passed from one subgraph's output to the next subgraph's input. For large stitched designs, you can also leverage RTL simulation (``sim='rtl'``) to perform the forward pass at the register-transfer level. In this case, a Verilog testbench is dynamically generated and executed against the stitched IP design, providing behavioral simulation to accurately verify latency and output at the hardware level. +Performs a forward pass through the chained bridge file using the C-simulation (``sim='csim'``). Data is automatically passed from one subgraph's output to the next subgraph's input. For large stitched designs, you can also leverage RTL simulation (``sim='rtl'``) to perform the forward pass at the register-transfer level. In this case, a Verilog testbench is dynamically generated and executed against the stitched IP design, providing behavioral simulation to accurately verify latency and output at the hardware level. Note that the input data for the RTL simulation must have a single batch dimension. .. code-block:: python @@ -126,3 +126,12 @@ Summary -------------------------- The ``MultiModelGraph`` class is a tool for modular hardware design. By splitting a large neural network into multiple subgraphs, building each independently, and then stitching them together, you gain flexibility, parallelism, and facilitate hierarchical design, incremental optimization, and integrated system-level simulations. + +-------------------------- +Other Notes +-------------------------- + +* Branch Splitting Limitation: Splitting in the middle of a branched architecture (e.g., ResNet skip connections or multi-path networks) is currently unsupported. Also, each split subgraph must have a single input and a single output. +* Handling Multiple NN Inputs & Outputs: The final NN output can support multiple output layers. However, for networks with multiple input layers, proper synchronization is required to drive inputs—especially for stream interfaces. A fork-join mechanism in the Verilog testbench can help manage input synchronization effectively. +* RTL Simulation Issue: RTL simulation of stitched IPs with io_type='io_parallel' and a split at the flatten layer leads to improper simulation behavior and should be avoided. +* Array Partitioning for Parallel I/O: For io_parallel interfaces, all IPs must use the 'partition' pragma instead of 'reshape'. 
\ No newline at end of file diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index d0913e78c1..5a312da08a 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -223,7 +223,7 @@ def build_stitched_design( stitched_report = aggregate_graph_reports(graph_reports) if sim_stitched_design: - testbench_output = read_testbench_log(testbench_log_path) + testbench_output = read_testbench_log(testbench_log_path, nn_config['outputs']) stitched_report['BehavSimResults'] = testbench_output['BehavSimResults'] stitched_report['StitchedDesignReport']['BestLatency'] = testbench_output['BestLatency'] stitched_report['StitchedDesignReport']['WorstLatency'] = testbench_output['WorstLatency'] diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 4c8cdb6677..4b81462cc8 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -1,6 +1,7 @@ import concurrent.futures import copy import ctypes +import uuid import importlib.util import os import platform @@ -1020,6 +1021,7 @@ def __init__(self, graphs): self._initialize_config(graphs[0]) self._bind_modelgraph_methods() self._initialize_io_attributes(graphs) + self._update_pragmas() def _initialize_config(self, first_graph): self.config = copy.copy(first_graph.config) @@ -1055,7 +1057,7 @@ def _update_project_config(self, first_graph): original_output_dir = first_graph.config.get_output_dir().partition('/graph')[0] self.config.config['OutputDir'] = os.path.join(original_output_dir, 'stitched') self.config.config['StitchedProjectName'] = 'vivado_stitched_design' - self.config.config['Stamp'] = '64616e' + self.config.config['Stamp'] = self._make_stamp() def __getitem__(self, index): return self.graphs[index] @@ -1223,6 +1225,20 @@ def _print_status(self, status): status_str = ' | '.join(f'{proj}: {status_icons.get(stat, "?")}' for proj, stat in status.items()) print(status_str, flush=True) + def _update_pragmas(self): + """ + Modifies the pragma for all layers in all graphs, replacing 'reshape' with 'partition' where applicable + """ + for g in self.graphs: + for layer_name in g.output_vars: + if hasattr(g.output_vars[layer_name], 'pragma'): + layer_pragma = g.output_vars[layer_name].pragma + if isinstance(layer_pragma, str) and layer_pragma == 'reshape': + g.output_vars[layer_name].pragma = 'partition' + print(f"Updating pragma in Layer '{layer_name}' from 'reshape' to 'partition'.") + else: + print(f"Layer '{layer_name}' does not have a 'pragma' attribute.") + def _assert_consistent_pragmas(self): """ Ensure all graphs have the same pragma in their input and output layers. @@ -1251,7 +1267,12 @@ def _assert_consistent_pragmas(self): raise ValueError( f"Pragma mismatch in graph {idx}:\n" f"Expected: {ref_pragmas}\n" f"Found: {current_pragmas}" ) - + + def _make_stamp(self): + length = 8 + stamp = uuid.uuid4() + return str(stamp)[-length:] + def _replace_logos(self): spec = importlib.util.find_spec("hls4ml") hls4ml_path = os.path.dirname(spec.origin) diff --git a/hls4ml/utils/simulation_utils.py b/hls4ml/utils/simulation_utils.py index e9a4429d9e..e6d5552830 100644 --- a/hls4ml/utils/simulation_utils.py +++ b/hls4ml/utils/simulation_utils.py @@ -5,57 +5,6 @@ import pandas as pd from lxml import etree - -def parse_component_xml(component_xml_path): - """ - Parse the given component.xml file and return structured information - about the input and output ports. 
- - Returns: - inputs (list): A list of dicts, each containing 'name', 'direction', and 'width' for input ports. - outputs (list): A list of dicts, each containing 'name', 'direction', and 'width' for output ports. - """ - if not os.path.exists(component_xml_path): - raise FileNotFoundError(f"component.xml not found at {component_xml_path}") - - # Parse the XML file - tree = etree.parse(component_xml_path) - root = tree.getroot() - - # Define the namespaces - ns = { - 'spirit': 'http://www.spiritconsortium.org/XMLSchema/SPIRIT/1685-2009', - 'xilinx': 'http://www.xilinx.com', - 'xsi': 'http://www.w3.org/2001/XMLSchema-instance', - } - - # Extract ports - ports = root.findall('.//spirit:model/spirit:ports/spirit:port', namespaces=ns) - inputs = [] - outputs = [] - - for port in ports: - name = port.find('spirit:name', namespaces=ns).text - wire = port.find('spirit:wire', namespaces=ns) - if wire is not None: - direction = wire.find('spirit:direction', namespaces=ns).text - vector = wire.find('spirit:vector', namespaces=ns) - if vector is not None: - left = vector.find('spirit:left', namespaces=ns).text - right = vector.find('spirit:right', namespaces=ns).text - width = abs(int(left) - int(right)) + 1 - else: - width = 1 - - port_info = {'name': name, 'direction': direction, 'width': width} - if direction == 'in': - inputs.append(port_info) - elif direction == 'out': - outputs.append(port_info) - - return inputs, outputs - - def write_verilog_testbench(nn_config, testbench_output_path): """ Generate a Verilog testbench for a given neural network configuration. @@ -552,8 +501,7 @@ def prepare_testbench_input(data, fifo_depth, batch_size): data_reshaped = data_arr.reshape((fifo_depth, batch_size)) return data_reshaped - -def read_testbench_log(testbench_log_path): +def read_testbench_log(testbench_log_path, outputs): """ Reads the testbench log file and returns a dictionary """ @@ -569,8 +517,13 @@ def read_testbench_log(testbench_log_path): sim_dict = {'BestLatency': int(BestLatency), 'WorstLatency': int(WorstLatency), 'BehavSimResults': []} - grouped = output_df.groupby('output_name') - for name, group in grouped: + ordered_output_names = [entry['name'] for entry in outputs] + for name in ordered_output_names: + group = output_df[output_df['output_name'] == name] + if group.empty: + print(f"Warning: Expected output '{name}' not found in testbench log.") + continue + indices = group['index'].astype(int) values = group['value'].astype(float) array = np.zeros(max(indices) + 1, dtype=np.float64) diff --git a/test/pytest/test_multi_graph.py b/test/pytest/test_multi_graph.py new file mode 100644 index 0000000000..2f361c4709 --- /dev/null +++ b/test/pytest/test_multi_graph.py @@ -0,0 +1,94 @@ +from pathlib import Path +import numpy as np +import pytest +import tensorflow as tf +from tensorflow.keras.layers import Input, Conv2D, Activation, MaxPooling2D, Flatten, Dense +import hls4ml + +test_root_path = Path(__file__).parent + +def create_test_model(): + """ + This architecture ensures testing of corner cases such as: + double layer outputs and variety of layers to serve as spliting points. 
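The reordered parsing above consumes the CSV written by the generated testbench; a sketch of the layout it expects, with made-up numbers (column order follows the ``$fwrite`` calls; ``nn_config`` as produced by ``parse_nn_config``):

.. code-block:: python

    from hls4ml.utils.simulation_utils import read_testbench_log

    # Hypothetical testbench_log.csv contents:
    #
    #   output_name,index,value
    #   BestLatency,0,12
    #   WorstLatency,0,14
    #   dense1,0,0.125000
    #   dense1,1,-0.750000
    #
    # Rows are regrouped per output layer, in the order given by
    # nn_config['outputs'], so multi-output models line up with CSim.
    sim = read_testbench_log('testbench_log.csv', nn_config['outputs'])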
+ """ + inp = Input(shape=(4, 4, 3), name='input_layer') + x = Conv2D(4, (3, 3), padding='same', name='conv1')(inp) + x = Activation('relu', name='relu1')(x) + x = MaxPooling2D((2, 2), name='pool1')(x) + x = Flatten(name='flatten')(x) + x = Dense(16, activation='relu', name='dense_common')(x) + output1 = Dense(5, activation='relu', name='dense1')(x) + output2 = Dense(5, activation='relu', name='dense2')(x) + model = tf.keras.Model(inputs=inp, outputs=[output1, output2]) + + return model + +@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) +@pytest.mark.parametrize('strategy', ['latency']) +@pytest.mark.parametrize('granularity', ['model', 'name']) +@pytest.mark.parametrize('split_layers', [ + ('pool1', 'dense_common'), + ('relu1', 'flatten') +]) +def test_multimodelgraph_predict(split_layers, io_type, strategy, granularity): + """ + Tests the multi-graph splitting and stitching process. + - Verifies that predictions from the monolithic and multi-graph versions match with the CSimulation. + - When granularity='name', an additional HLS build and stitched RTL simulation step is performed. + - The RTL simulation outputs are compared against the predicted values from CSimulation. + """ + backend = 'vitis' + model = create_test_model() + model.compile(optimizer='adam', loss='categorical_crossentropy') + X_input = np.random.rand(5, 4, 4, 3).astype(np.float32) + keras_pred = model.predict(X_input) + + config = hls4ml.utils.config_from_keras_model(model, granularity=granularity, default_precision='ap_fixed<32,16>') + config['Model']['Strategy'] = strategy + + output_dir_mono = str(test_root_path / f"hls4mlprj_mono_{granularity}_{'_'.join(split_layers)}_{io_type}_{strategy}") + output_dir_multi = str(test_root_path / f"hls4mlprj_multi_{granularity}_{'_'.join(split_layers)}_{io_type}_{strategy}") + + # --- Monolithic HLS conversion (no split) --- + hls_model_mono = hls4ml.converters.convert_from_keras_model( + model, + hls_config=config, + output_dir=output_dir_mono, + backend=backend, + io_type=io_type + ) + hls_model_mono.compile() + pred_mono = hls_model_mono.predict(X_input) + + # --- Multi-model conversion with split --- + hls_model_multi = hls4ml.converters.convert_from_keras_model( + model, + hls_config=config, + output_dir=output_dir_multi, + backend=backend, + io_type=io_type, + split_layer_names=list(split_layers) + ) + hls_model_multi.compile() + pred_multi = hls_model_multi.predict(X_input) + + assert hasattr(hls_model_multi, 'graphs'), "Multi-model graph missing 'graphs' attribute." 
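Conceptually, the ``graphs`` attribute asserted on above is what lets CSim prediction chain the subgraphs; a rough sketch that ignores the compiled bridge (``hls_model_multi`` and ``X_input`` as in this test):

.. code-block:: python

    # Rough equivalent of MultiModelGraph.predict(x, sim='csim'):
    # each subgraph's output becomes the next subgraph's input.
    out = X_input
    for g in hls_model_multi.graphs:
        out = g.predict(out)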
+ assert len(hls_model_multi.graphs) == 3, f"Expected 3 subgraphs, got {len(hls_model_multi.graphs)}" + + for mono_out, multi_out in zip(pred_mono, pred_multi): + np.testing.assert_allclose(multi_out, mono_out, rtol=0, atol=1e-5) + + if granularity == 'name': + if io_type == 'io_parallel' and split_layers == ('relu1', 'flatten'): + pytest.skip("Skipping RTL simulation for io_parallel with split layer at flatten due to improper simulation behavior.") + + # --- Optional: Build the HLS project and run simulation --- + hls_model_multi.build(csim=False, cosim=False, vsynth=False, export=True, + stitch_design=True, sim_stitched_design=True, export_stitched_design=True) + + # test only the first sample, as batch prediction is not supported for stitched RTL simulations + inp = np.expand_dims(X_input[0], axis=0) + sim_results = hls_model_multi.predict(inp, sim = 'rtl') + for sim_out, pred_out in zip(sim_results, list([pred_multi[0][0], pred_multi[1][0]])): + np.testing.assert_allclose(sim_out, pred_out, rtol=0, atol=0.3) From ba8613246e29e2ae4bfd2030f75a5bd1a9811774 Mon Sep 17 00:00:00 2001 From: dimdano Date: Tue, 4 Mar 2025 16:17:42 +0100 Subject: [PATCH 43/50] pre-commit fixes --- docs/ir/multimodelgraph.rst | 2 +- hls4ml/backends/vitis/vitis_backend.py | 13 +- hls4ml/converters/keras_to_hls.py | 2 +- hls4ml/model/graph.py | 58 +++--- hls4ml/report/__init__.py | 2 +- .../templates/vivado/build_lib_multigraph.sh | 2 +- hls4ml/templates/vivado/ip_stitcher.tcl | 2 +- hls4ml/utils/simulation_utils.py | 190 +++++++++--------- hls4ml/writer/vivado_writer.py | 57 +++--- test/pytest/test_multi_graph.py | 41 ++-- 10 files changed, 194 insertions(+), 175 deletions(-) diff --git a/docs/ir/multimodelgraph.rst b/docs/ir/multimodelgraph.rst index abab7f43a7..35a4dfe78d 100644 --- a/docs/ir/multimodelgraph.rst +++ b/docs/ir/multimodelgraph.rst @@ -134,4 +134,4 @@ Other Notes * Branch Splitting Limitation: Splitting in the middle of a branched architecture (e.g., ResNet skip connections or multi-path networks) is currently unsupported. Also, each split subgraph must have a single input and a single output. * Handling Multiple NN Inputs & Outputs: The final NN output can support multiple output layers. However, for networks with multiple input layers, proper synchronization is required to drive inputs—especially for stream interfaces. A fork-join mechanism in the Verilog testbench can help manage input synchronization effectively. * RTL Simulation Issue: RTL simulation of stitched IPs with io_type='io_parallel' and a split at the flatten layer leads to improper simulation behavior and should be avoided. -* Array Partitioning for Parallel I/O: For io_parallel interfaces, all IPs must use the 'partition' pragma instead of 'reshape'. \ No newline at end of file +* Array Partitioning for Parallel I/O: For io_parallel interfaces, all IPs must use the 'partition' pragma instead of 'reshape'. 
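The last bullet above is enforced automatically when a ``MultiModelGraph`` is created; a sketch of the normalization, mirroring ``_update_pragmas`` from this patch:

.. code-block:: python

    def update_pragmas(graphs):
        # Replace a 'reshape' pragma with 'partition' on every output
        # variable that carries one, as io_parallel stitching requires.
        for g in graphs:
            for var in g.output_vars.values():
                if getattr(var, 'pragma', None) == 'reshape':
                    var.pragma = 'partition'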
diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index 5a312da08a..a0c2207c96 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -115,7 +115,16 @@ def build( build_command = ( 'vitis_hls -f build_prj.tcl "reset={reset} csim={csim} synth={synth} cosim={cosim} ' 'validation={validation} export={export} vsynth={vsynth} fifo_opt={fifo_opt}"' - ).format(reset=reset, csim=csim, synth=synth, cosim=cosim, validation=validation, export=export, vsynth=vsynth, fifo_opt=fifo_opt) + ).format( + reset=reset, + csim=csim, + synth=synth, + cosim=cosim, + validation=validation, + export=export, + vsynth=vsynth, + fifo_opt=fifo_opt, + ) output_dir = model.config.get_output_dir() stdout_log = os.path.join(output_dir, 'build_stdout.log') @@ -207,7 +216,7 @@ def build_stitched_design( f'export_design={int(export_stitched_design)}', f"stitch_project_name={nn_config['StitchedProjectName']}", f"original_project_name={nn_config['OriginalProjectName']}", - f'sim_verilog_file=testbench.v', + 'sim_verilog_file=testbench.v', ] with open(stdout_log, 'w') as stdout_file, open(stderr_log, 'w') as stderr_file: diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index 7a6bd9de28..b0f1964331 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ -330,7 +330,7 @@ def keras_to_hls(config, split_layer_names=None): merge_layers = ['add', 'subtract', 'multiply', 'average', 'maximum', 'minimum', 'concatenate', 'dot'] if split_layer_names: if any(any(layer in name for layer in merge_layers) for name in split_layer_names): - raise ValueError(f'Split layer must not be a merge layer') + raise ValueError('Split layer must not be a merge layer') hls_model = ModelGraph.make_multi_graph( config, layer_list, input_layers, output_layers, output_shapes, split_layer_names ) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 4b81462cc8..f67bc89208 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -1,15 +1,12 @@ import concurrent.futures import copy import ctypes -import uuid import importlib.util import os import platform -import re import shutil -import stat import threading -import warnings +import uuid from collections import OrderedDict import numpy as np @@ -982,7 +979,9 @@ def make_multi_graph(cls, config, layer_list, input_layers, output_layers, outpu if previous_layer_name in sub_config['HLSConfig']['LayerName']: prev_layer_config = sub_config['HLSConfig']['LayerName'][previous_layer_name] new_layer_config = {} - new_layer_config['Precision'] = last_output_precision if last_output_precision is not None else 'auto' + new_layer_config['Precision'] = ( + last_output_precision if last_output_precision is not None else 'auto' + ) # NOTE - We copy Trace as well but it might be better to reset it new_layer_config['Trace'] = prev_layer_config['Trace'] sub_config['HLSConfig']['LayerName'][input_layer_name] = new_layer_config @@ -1001,7 +1000,11 @@ def make_multi_graph(cls, config, layer_list, input_layers, output_layers, outpu if hls_model.graph: last_layer = next(reversed(hls_model.graph.values())) last_prec = last_layer.attributes.get('result_t') - last_output_precision = (last_prec.precision if hasattr(last_prec, 'precision') else last_prec) if last_prec is not None else 'auto' + last_output_precision = ( + (last_prec.precision if hasattr(last_prec, 'precision') else last_prec) + if last_prec is not None + else 'auto' + ) if last_output_precision == 
'auto' or last_output_precision is None: raise ValueError("Could not extract a valid precision from the last layer!") @@ -1028,7 +1031,7 @@ def _initialize_config(self, first_graph): # Deep copy only 'ProjectName' and 'OutputDir', shallow copy others keys_to_deepcopy = ['ProjectName', 'OutputDir'] self.config.config = { - k: copy.deepcopy(first_graph.config.config[k]) if k in keys_to_deepcopy else first_graph.config.config[k] + k: copy.deepcopy(first_graph.config.config[k]) if k in keys_to_deepcopy else first_graph.config.config[k] for k in first_graph.config.config } self._update_project_config(first_graph) @@ -1048,7 +1051,7 @@ def _initialize_io_attributes(self, graphs): self._top_function_lib = None self.inputs = graphs[0].inputs self.outputs = graphs[-1].outputs - self.output_vars = graphs[-1].output_vars + self.output_vars = graphs[-1].output_vars def _update_project_config(self, first_graph): original_project_name = first_graph.config.get_project_name().partition('_graph')[0] @@ -1077,18 +1080,18 @@ def parse_nn_config(self): pragma = graph.output_vars[layer].pragma layer_pragma, fifo_depth = self._get_pragma_details(pragma) if total_bits % fifo_depth != 0: - raise ValueError(f"Division of total_bits by fifo_depth does not result in a remainder of zero.") + raise ValueError('Division of total_bits by fifo_depth does not result in a remainder of zero.') batch_size = total_bits // fifo_depth precision = graph.output_vars[layer].type.precision nn_config[io_type].append( { - "name": graph.output_vars[layer].name, - "pragma": layer_pragma, - "integer_bits": int(precision.integer), - "fractional_bits": int(precision.fractional), - "signed": int(precision.signed), - "fifo_depth": int(fifo_depth), - "batch_size": int(batch_size), + 'name': graph.output_vars[layer].name, + 'pragma': layer_pragma, + 'integer_bits': int(precision.integer), + 'fractional_bits': int(precision.fractional), + 'signed': int(precision.signed), + 'fifo_depth': int(fifo_depth), + 'batch_size': int(batch_size), } ) @@ -1112,7 +1115,7 @@ def build( status = {} status_lock = threading.Lock() - for idx, g in enumerate(self.graphs, start=1): + for idx, _ in enumerate(self.graphs, start=1): status[f'graph{idx}'] = 'Pending' def build_wrapper(idx, g, **kwargs): @@ -1130,7 +1133,7 @@ def build_wrapper(idx, g, **kwargs): with status_lock: status[graph_name] = 'Failed' self._print_status(status) - raise + raise exc with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: future_to_idx = { @@ -1144,6 +1147,7 @@ def build_wrapper(idx, g, **kwargs): build_results[graph_name] = result except Exception as exc: build_results[graph_name] = None + print(f"Error while building {graph_name}: {exc}") self.graph_reports = build_results self._replace_logos() @@ -1188,7 +1192,7 @@ def predict(self, x, sim='csim'): return stitched_report['BehavSimResults'] else: print('Unknown simulation option given.') - + def trace(self, x): raise NotImplementedError("Trace function has not been implemented yet for MultiModelGraph.") @@ -1197,13 +1201,13 @@ def get_input_variables(self): for inp in self.inputs: variables.append(self.graphs[0].graph[inp].get_output_variable()) return variables - + def get_layers(self): all_values = [] for g in self.graphs: all_values.extend(g.graph.values()) return dict(zip(all_values, all_values)).values() - + def _get_pragma_details(self, pragma): """ Extracts the pragma type and FIFO depth from the given pragma. 
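For orientation, one entry appended to ``nn_config`` by ``parse_nn_config`` has this shape (field names from the code above; the values are illustrative):

.. code-block:: python

    {
        'name': 'dense1',
        'pragma': 'stream',      # 'partition' for io_parallel interfaces
        'integer_bits': 16,
        'fractional_bits': 16,
        'signed': 1,
        'fifo_depth': 5,
        'batch_size': 1,
    }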
@@ -1236,8 +1240,6 @@ def _update_pragmas(self): if isinstance(layer_pragma, str) and layer_pragma == 'reshape': g.output_vars[layer_name].pragma = 'partition' print(f"Updating pragma in Layer '{layer_name}' from 'reshape' to 'partition'.") - else: - print(f"Layer '{layer_name}' does not have a 'pragma' attribute.") def _assert_consistent_pragmas(self): """ @@ -1267,12 +1269,12 @@ def _assert_consistent_pragmas(self): raise ValueError( f"Pragma mismatch in graph {idx}:\n" f"Expected: {ref_pragmas}\n" f"Found: {current_pragmas}" ) - + def _make_stamp(self): - length = 8 - stamp = uuid.uuid4() - return str(stamp)[-length:] - + length = 8 + stamp = uuid.uuid4() + return str(stamp)[-length:] + def _replace_logos(self): spec = importlib.util.find_spec("hls4ml") hls4ml_path = os.path.dirname(spec.origin) diff --git a/hls4ml/report/__init__.py b/hls4ml/report/__init__.py index 88c21a2289..6973a77dd8 100644 --- a/hls4ml/report/__init__.py +++ b/hls4ml/report/__init__.py @@ -3,7 +3,7 @@ from hls4ml.report.catapult_report import read_catapult_report # noqa: F401 from hls4ml.report.quartus_report import parse_quartus_report # noqa: F401 from hls4ml.report.quartus_report import read_quartus_report # noqa: F401 +from hls4ml.report.vivado_report import aggregate_graph_reports # noqa: F401 from hls4ml.report.vivado_report import parse_vivado_report # noqa: F401 from hls4ml.report.vivado_report import print_vivado_report # noqa: F401 from hls4ml.report.vivado_report import read_vivado_report # noqa: F401 -from hls4ml.report.vivado_report import aggregate_graph_reports diff --git a/hls4ml/templates/vivado/build_lib_multigraph.sh b/hls4ml/templates/vivado/build_lib_multigraph.sh index 8ef0c9cc9d..8a7c13dbc1 100644 --- a/hls4ml/templates/vivado/build_lib_multigraph.sh +++ b/hls4ml/templates/vivado/build_lib_multigraph.sh @@ -27,7 +27,7 @@ for g in "${graph_project_names[@]}"; do SRC_FILE="${g}/firmware/${ORIGINAL_PROJECT}_${g}.cpp" OBJ_FILE="${ORIGINAL_PROJECT}_${g}.o" AP_TYPES_PATH="-I${BASEDIR}/${g}/firmware/ap_types/" - + ${CC} ${CFLAGS} ${AP_TYPES_PATH} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c "${BASEDIR}/${SRC_FILE}" -o "${OBJ_FILE}" OBJECT_FILES+=("${OBJ_FILE}") INCFLAGS+="-I${BASEDIR}/${g}/ " diff --git a/hls4ml/templates/vivado/ip_stitcher.tcl b/hls4ml/templates/vivado/ip_stitcher.tcl index db19e27046..71146884bb 100644 --- a/hls4ml/templates/vivado/ip_stitcher.tcl +++ b/hls4ml/templates/vivado/ip_stitcher.tcl @@ -162,7 +162,7 @@ proc stitch_procedure {base_dir stitch_project_name original_project_name bd_nam # Set the CONFIG.POLARITY property of the 'ap_rst' port based on the retrieved polarity set_property CONFIG.POLARITY $rst_polarity $ap_rst_port - + # Rename the port based on polarity if {$rst_polarity eq "ACTIVE_LOW"} { set rst_port_name "ap_rst_n" diff --git a/hls4ml/utils/simulation_utils.py b/hls4ml/utils/simulation_utils.py index e6d5552830..d9f610d932 100644 --- a/hls4ml/utils/simulation_utils.py +++ b/hls4ml/utils/simulation_utils.py @@ -1,9 +1,8 @@ -import json import os import numpy as np import pandas as pd -from lxml import etree + def write_verilog_testbench(nn_config, testbench_output_path): """ @@ -15,41 +14,39 @@ def write_verilog_testbench(nn_config, testbench_output_path): - Data capture and logging for outputs - Latency measurement """ - inputs = nn_config['inputs'] - outputs = nn_config['outputs'] pragma = nn_config['inputs'][0]['pragma'] # NOTE we usually have active-low in stream interfaces and active-high in partitioned interfaces. 
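Throughout the generator below, fixed-point output words are logged as reals; the same scaling in Python form (a sketch, assuming two's-complement values):

.. code-block:: python

    def to_real(fixed_val: int, f_bits: int) -> float:
        # Same conversion the testbench applies: value / 2**f_bits.
        return fixed_val / float(1 << f_bits)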
rst_name = 'ap_rst_n' if pragma == 'stream' else 'ap_rst' with open(testbench_output_path, 'w') as f: - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Header and Module Declaration - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write('`timescale 1ns / 1ps\n\n') f.write('module tb_design_1_wrapper;\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Clock and Reset Signals - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Clock and Reset Signals\n') f.write(' //------------------------------------------------------------------------\n') f.write(' reg ap_clk;\n') f.write(f' reg {rst_name};\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Control and Handshaking Signals - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Control and Handshaking Signals\n') f.write(' //------------------------------------------------------------------------\n') f.write(' reg ap_start;\n') f.write(' wire ap_done;\n\n') - if(pragma == 'stream'): - #---------------------------------------------------------------------- + if pragma == 'stream': + # ---------------------------------------------------------------------- # AXI4-Stream Input Interfaces - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // AXI4-Stream Input Interfaces\n') f.write(' //------------------------------------------------------------------------\n') @@ -62,9 +59,9 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(f' reg {name}_tvalid;\n') f.write(f' wire {name}_tready;\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # AXI4-Stream Output Interfaces - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // AXI4-Stream Output Interfaces\n') f.write(' //------------------------------------------------------------------------\n') @@ -77,9 +74,9 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(f' wire {name}_tvalid;\n') f.write(f' reg {name}_tready;\n\n') else: - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Partitioned Input Interfaces - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' 
//------------------------------------------------------------------------\n') f.write(' // Partitioned Input Interfaces\n') f.write(' //------------------------------------------------------------------------\n') @@ -92,9 +89,9 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(f' reg [{total_bits - 1}:0] {name}_{idx};\n') f.write(f' reg {name}_{idx}_ap_vld;\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Partitioned Output Interfaces - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Partitioned Output Interfaces\n') f.write(' //------------------------------------------------------------------------\n') @@ -107,9 +104,9 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(f' wire [{total_bits - 1}:0] {name}_{idx};\n') f.write(f' wire {name}_{idx}_ap_vld;\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # DUT Instantiation - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // DUT Instantiation\n') f.write(' //------------------------------------------------------------------------\n') @@ -123,7 +120,7 @@ def write_verilog_testbench(nn_config, testbench_output_path): for layer in nn_config['inputs']: name = layer["name"] batch_size = layer['batch_size'] - if (pragma == 'stream'): + if pragma == 'stream': f.write(f' .{name}_tdata({name}_tdata),\n') f.write(f' .{name}_tready({name}_tready),\n') f.write(f' .{name}_tvalid({name}_tvalid),\n') @@ -136,7 +133,7 @@ def write_verilog_testbench(nn_config, testbench_output_path): for layer in nn_config['outputs'][:-1]: name = layer["name"] batch_size = layer['batch_size'] - if (pragma == 'stream'): + if pragma == 'stream': f.write(f' .{name}_tdata({name}_tdata),\n') f.write(f' .{name}_tready({name}_tready),\n') f.write(f' .{name}_tvalid({name}_tvalid),\n') @@ -149,22 +146,22 @@ def write_verilog_testbench(nn_config, testbench_output_path): last_output_layer = nn_config['outputs'][-1] name = last_output_layer["name"] batch_size = last_output_layer['batch_size'] - if (pragma == 'stream'): + if pragma == 'stream': f.write(f' .{name}_tdata({name}_tdata),\n') f.write(f' .{name}_tready({name}_tready),\n') f.write(f' .{name}_tvalid({name}_tvalid)\n') else: for idx in range(batch_size): - f.write(f' .{name}_{idx}({name}_{idx}),\n') - if idx < batch_size - 1: - f.write(f' .{name}_{idx}_ap_vld({name}_{idx}_ap_vld),\n') - else: - f.write(f' .{name}_{idx}_ap_vld({name}_{idx}_ap_vld)\n') + f.write(f' .{name}_{idx}({name}_{idx}),\n') + if idx < batch_size - 1: + f.write(f' .{name}_{idx}_ap_vld({name}_{idx}_ap_vld),\n') + else: + f.write(f' .{name}_{idx}_ap_vld({name}_{idx}_ap_vld)\n') f.write(' );\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Clock Generation - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' 
//------------------------------------------------------------------------\n') f.write(' // Clock Generation (100 MHz => 10 ns period)\n') f.write(' //------------------------------------------------------------------------\n') @@ -173,9 +170,9 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(' forever #5 ap_clk = ~ap_clk;\n') f.write(' end\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Reset Generation - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Reset Generation\n') f.write(' // Wait for a cycle and then release reset.\n') @@ -188,12 +185,12 @@ def write_verilog_testbench(nn_config, testbench_output_path): else: f.write(f' {rst_name} = 1;\n') f.write(' repeat (1) @(posedge ap_clk);\n') - f.write(f' {rst_name} = 0;\n') + f.write(f' {rst_name} = 0;\n') f.write(' end\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Signal Initialization - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Signal Initialization\n') f.write(' // Initialize control signals, input valid, and output ready.\n') @@ -202,22 +199,22 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(' ap_start = 0;\n') for layer in nn_config['inputs']: - name = layer['name'] - batch_size = layer['batch_size'] - if pragma == 'stream': - f.write(f' {name}_tvalid = 0;\n') - else: - for idx in range(batch_size): - f.write(f' {name}_{idx}_ap_vld = 0;\n') + name = layer['name'] + batch_size = layer['batch_size'] + if pragma == 'stream': + f.write(f' {name}_tvalid = 0;\n') + else: + for idx in range(batch_size): + f.write(f' {name}_{idx}_ap_vld = 0;\n') if pragma == 'stream': for layer in nn_config['outputs']: name = layer['name'] f.write(f' {name}_tready = 1;\n') f.write(' end\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Variables for Logging and Measurement - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Logging and Measurement Variables\n') f.write(' //------------------------------------------------------------------------\n') @@ -231,9 +228,9 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(' reg [1:0] done_counter = 0;\n') f.write(' reg old_ap_done = 0;\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Cycle Counting - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Cycle Counting\n') f.write(' // Count cycles to measure latency.\n') @@ 
-244,14 +241,14 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(' cycle_count <= 0;\n') else: f.write(f' if ({rst_name})\n') - f.write(' cycle_count <= 0;\n') + f.write(' cycle_count <= 0;\n') f.write(' else\n') f.write(' cycle_count <= cycle_count + 1;\n') f.write(' end\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Data Transmission (Stimulus Generation) - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Data Transmission (Stimulus)\n') f.write(' // Send input patterns to the DUT.\n') @@ -261,7 +258,7 @@ def write_verilog_testbench(nn_config, testbench_output_path): if rst_name == 'ap_rst_n': f.write(f' wait ({rst_name} == 1);\n') else: - f.write(f' wait ({rst_name} == 0);\n') + f.write(f' wait ({rst_name} == 0);\n') f.write(' // Open CSV log file\n') f.write(' csv_file = $fopen("../../../../testbench_log.csv", "w");\n') @@ -275,9 +272,9 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(' // Start the DUT\n') f.write(' ap_start = 1;\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Sending first pattern of inputs (all zeroes) - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- for layer in nn_config['inputs']: i_bits = layer["integer_bits"] f_bits = layer["fractional_bits"] @@ -303,26 +300,26 @@ def write_verilog_testbench(nn_config, testbench_output_path): if pragma == 'stream': f.write(f' {name}_tvalid = 0;\n\n') else: - f.write(f' // Assert valid signals\n') + f.write(' // Assert valid signals\n') for k in range(batch_size): - f.write(f' {name}_{k}_ap_vld = 1;\n') + f.write(f' {name}_{k}_ap_vld = 1;\n') f.write(' // Start the DUT\n') - f.write(f' ap_start = 1;\n') - f.write(f' @(posedge ap_clk);\n') - f.write(f' ap_start = 0;\n') - f.write(f' // Deassert valid signals\n') + f.write(' ap_start = 1;\n') + f.write(' @(posedge ap_clk);\n') + f.write(' ap_start = 0;\n') + f.write(' // Deassert valid signals\n') for k in range(batch_size): - f.write(f' {name}_{k}_ap_vld = 0;\n') - f.write(f'\n') - f.write(f' // Wait for ap_done to go high\n') - f.write(f' wait (ap_done);\n') - f.write(f' // Wait for ap_done to go low before sending next input\n') - f.write(f' wait (!ap_done);\n') - f.write(f' // Wait for ap_done to go high\n') - - #---------------------------------------------------------------------- + f.write(f' {name}_{k}_ap_vld = 0;\n') + f.write('\n') + f.write(' // Wait for ap_done to go high\n') + f.write(' wait (ap_done);\n') + f.write(' // Wait for ap_done to go low before sending next input\n') + f.write(' wait (!ap_done);\n') + f.write(' // Wait for ap_done to go high\n') + + # ---------------------------------------------------------------------- # Sending second pattern of inputs (read from file) - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- for layer in nn_config['inputs']: i_bits = layer["integer_bits"] f_bits = layer["fractional_bits"] @@ -335,42 +332,42 @@ def write_verilog_testbench(nn_config, testbench_output_path): 
if pragma == 'stream': f.write(f' {name}_tvalid = 1;\n') f.write(f' file = $fopen("../../../../{input_file}", "r");\n') - f.write(f' if (file == 0) begin\n') + f.write(' if (file == 0) begin\n') f.write(f' $display("Error opening file {input_file}");\n') - f.write(f' $finish;\n') - f.write(f' end\n') + f.write(' $finish;\n') + f.write(' end\n') f.write(f' for (j = 0; j < {fifo_depth}; j = j + 1) begin\n') # For each line, read batch_size values: for k in range(batch_size): upper = (k + 1) * total_bits - 1 lower = k * total_bits - f.write(f' r = $fscanf(file, "%d", value);\n') + f.write(' r = $fscanf(file, "%d", value);\n') if pragma == 'stream': f.write(f' {name}_tdata[{upper}:{lower}] = value;\n') else: f.write(f' {name}_{k} = value;\n') - if pragma == 'stream': + if pragma == 'stream': f.write(f' while ({name}_tready == 0) @(posedge ap_clk);\n') f.write(' @(posedge ap_clk);\n') f.write(' end\n') if pragma == 'partition': - f.write(f' // Assert valid signals\n') + f.write(' // Assert valid signals\n') for k in range(batch_size): - f.write(f' {name}_{k}_ap_vld = 1;\n') + f.write(f' {name}_{k}_ap_vld = 1;\n') f.write(' // Start the DUT\n') - f.write(f' ap_start = 1;\n') - f.write(f' @(posedge ap_clk);\n') - f.write(f' ap_start = 0;\n') - f.write(f' // Deassert valid signals\n') + f.write(' ap_start = 1;\n') + f.write(' @(posedge ap_clk);\n') + f.write(' ap_start = 0;\n') + f.write(' // Deassert valid signals\n') for k in range(batch_size): - f.write(f' {name}_{k}_ap_vld = 0;\n') - f.write(f'\n') + f.write(f' {name}_{k}_ap_vld = 0;\n') + f.write('\n') f.write(' end\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Output Data Capture and Logging - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Output Data Capture and Logging\n') f.write(' // Capture output for 2nd input (done_counter == 1) and log them to CSV.\n') @@ -390,32 +387,34 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(f' real real_val_{i};\n') else: f.write(f' reg signed [{total_bits-1}:0] fixed_val_{i};\n') - f.write(f' real real_val_{i};\n') - f.write(f' always @(posedge ap_clk) begin\n') + f.write(f' real real_val_{i};\n') + f.write(' always @(posedge ap_clk) begin\n') if pragma == 'stream': f.write(f' if (done_counter == 1 && {layer_name}_tvalid && {layer_name}_tready) begin\n') f.write(f' for (idx_{i} = 0; idx_{i} < {batch_size}; idx_{i} = idx_{i} + 1) begin\n') f.write(f' fixed_val_{i} = {layer_name}_tdata[(idx_{i}+1)*{total_bits}-1 -: {total_bits}];\n') f.write(f' real_val_{i} = fixed_val_{i} / (1.0 * (1 << {f_bits}));\n') - f.write(f' $display("Output {layer_name}[%0d]: integer_bits=%0d fractional_bits=%0d value=%f", idx_{i}, {i_bits}, {f_bits}, real_val_{i});\n') + f.write(f' $display("Output {layer_name}[%0d]: %f", idx_{i}, real_val_{i});\n') f.write(' // Log result to CSV\n') f.write(f' $fwrite(csv_file, "%s,%0d,%f\\n", "{layer_name}", idx_{i}, real_val_{i});\n') f.write(' end\n') else: - f.write(f' // Note: The expected behavior in most cases is to have valid outputs (ap_vld=1) when ap_done = 1\n') - f.write(f' if (done_counter == 1 && ap_done == 1) begin\n') - for idx in range (batch_size): + f.write( + ' // Note: The usual expected behavior is to have valid outputs (ap_vld=1) when 
ap_done = 1\n' + ) + f.write(' if (done_counter == 1 && ap_done == 1) begin\n') + for idx in range(batch_size): f.write(f' fixed_val_{i} = {layer_name}_{idx}[{total_bits - 1}:0];\n') f.write(f' real_val_{i} = fixed_val_{i} / (1.0 * (1 << {f_bits}));\n') - f.write(f' $display("Output {layer_name}_{idx}: integer_bits=%0d fractional_bits=%0d value=%f", {i_bits}, {f_bits}, real_val_{i});\n') + f.write(f' $display("Output {layer_name}_{idx}: %f", real_val_{i});\n') f.write(' // Log result to CSV\n') - f.write(f' $fwrite(csv_file, "%s,%0d,%f\\n", "{layer_name}", {idx}, real_val_{i});\n') + f.write(f' $fwrite(csv_file, "%s,%0d,%f\\n", "{layer_name}", {idx}, real_val_{i});\n') f.write(' end\n') f.write(' end\n\n') - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Latency Measurement and Test End - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- f.write(' //------------------------------------------------------------------------\n') f.write(' // Latency Measurement\n') f.write(' // Measures the cycle count between start and subsequent ap_done signals.\n') @@ -424,7 +423,7 @@ def write_verilog_testbench(nn_config, testbench_output_path): if rst_name == 'ap_rst_n': f.write(f' if (!{rst_name}) begin\n') else: - f.write(f' if ({rst_name}) begin\n') + f.write(f' if ({rst_name}) begin\n') f.write(' old_ap_done <= 0;\n') f.write(' end else begin\n') f.write(' old_ap_done <= ap_done;\n') @@ -501,6 +500,7 @@ def prepare_testbench_input(data, fifo_depth, batch_size): data_reshaped = data_arr.reshape((fifo_depth, batch_size)) return data_reshaped + def read_testbench_log(testbench_log_path, outputs): """ Reads the testbench log file and returns a dictionary diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 7f013253b2..9ef5503ab5 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -731,10 +731,12 @@ def write_bridge_multigraph(self, model): for graph_idx, g in enumerate(model.graphs): newline += '#undef DEFINES_H_\n' if len(g.outputs) == 1: - newline += '#define result_t ' + 'result_graph' + str(graph_idx+1) + '_t\n' + newline += '#define result_t ' + 'result_graph' + str(graph_idx + 1) + '_t\n' newline += line.replace('myproject', format(model.graphs[graph_idx].config.config['ProjectName'])) if len(g.outputs) == 1: - newline += 'typedef result_graph' + str(graph_idx+1) + '_t graph' + str(graph_idx+1) + '_result_t;\n' + newline += ( + 'typedef result_graph' + str(graph_idx + 1) + '_t graph' + str(graph_idx + 1) + '_result_t;\n' + ) newline += '#undef result_t\n\n' if graph_idx < len(model.graphs) - 1 else '\n' newline += '\n' elif 'myproject' in line: @@ -768,8 +770,8 @@ def write_bridge_multigraph(self, model): for o in g.get_output_variables(): definition = o.definition_cpp(name_suffix='_ap') if len(g.outputs) == 1: - parts = definition.split(' ', 1) - datatype = 'graph'+str(idx+1) + '_result_t' + parts = definition.split(' ', 1) + datatype = 'graph' + str(idx + 1) + '_result_t' if parts[0].startswith('hls::stream'): modified_definition = 'hls::stream<' + datatype + '> ' + parts[1] else: @@ -786,8 +788,10 @@ def write_bridge_multigraph(self, model): if idx == 0: input_vars = ','.join([i.name + '_ap' for i in g.get_input_variables()]) else: - input_vars = output_vars - bram_vars = ','.join([b.name for b in [var for var in g.get_weight_variables() if 
var.storage.lower() == 'bram']]) + input_vars = output_vars + bram_vars = ','.join( + [b.name for b in [var for var in g.get_weight_variables() if var.storage.lower() == 'bram']] + ) output_vars = ','.join([o.name + '_ap' for o in g.get_output_variables()]) # Concatenate the input, output, and bram variables. Filter out empty/null values all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars])) @@ -833,7 +837,6 @@ def write_bridge_multigraph(self, model): f.close() fout.close() - def write_build_script(self, model): """Write the TCL/Shell build scripts (project.tcl, build_prj.tcl, vivado_synth.tcl, build_lib.sh) @@ -881,26 +884,26 @@ def write_build_script(self, model): dst.write(line) build_lib_dst.chmod(build_lib_dst.stat().st_mode | stat.S_IEXEC) - + def write_build_script_multigraph(self, model): - """Write the build script (build_lib.sh) for stitched multigraph project - Args: - model (MultiModelGraph): the hls4ml multigraph model. - """ - filedir = Path(__file__).parent - os.makedirs(model.config.get_output_dir(), exist_ok=True) - build_lib_src = (filedir / '../templates/vivado/build_lib_multigraph.sh').resolve() - build_lib_dst = Path(f'{model.config.get_output_dir()}/build_lib.sh').resolve() - graph_project_names = ' '.join(f"\"{g.config.get_output_dir().split('/')[-1]}\"" for g in model.graphs) - - with open(build_lib_src) as src, open(build_lib_dst, 'w') as dst: - for line in src.readlines(): - line = line.replace('myproject', model.config.config['OriginalProjectName']) - line = line.replace('myproject_stitched', model.config.config['ProjectName']) - line = line.replace('mystamp', model.config.config['Stamp']) - line = line.replace('mygraph_name_list', graph_project_names) - dst.write(line) - os.chmod(build_lib_dst, os.stat(build_lib_dst).st_mode | stat.S_IEXEC) + """Write the build script (build_lib.sh) for stitched multigraph project + Args: + model (MultiModelGraph): the hls4ml multigraph model. 
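For orientation, the per-graph ``result_t`` aliasing written by ``write_bridge_multigraph`` yields a preamble along these lines for a single-output subgraph (illustrative; the generated text is C++, shown here as comments):

.. code-block:: python

    # Generated bridge preamble, per subgraph (illustrative):
    #
    #   #undef DEFINES_H_
    #   #define result_t result_graph1_t
    #   ...                                  # subgraph defines included here
    #   typedef result_graph1_t graph1_result_t;
    #   #undef result_t
    #
    # Each subgraph's result type gets a unique alias, so intermediate
    # buffers can be declared as, e.g., hls::stream<graph1_result_t>.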
+ """ + filedir = Path(__file__).parent + os.makedirs(model.config.get_output_dir(), exist_ok=True) + build_lib_src = (filedir / '../templates/vivado/build_lib_multigraph.sh').resolve() + build_lib_dst = Path(f'{model.config.get_output_dir()}/build_lib.sh').resolve() + graph_project_names = ' '.join(f"\"{g.config.get_output_dir().split('/')[-1]}\"" for g in model.graphs) + + with open(build_lib_src) as src, open(build_lib_dst, 'w') as dst: + for line in src.readlines(): + line = line.replace('myproject', model.config.config['OriginalProjectName']) + line = line.replace('myproject_stitched', model.config.config['ProjectName']) + line = line.replace('mystamp', model.config.config['Stamp']) + line = line.replace('mygraph_name_list', graph_project_names) + dst.write(line) + os.chmod(build_lib_dst, os.stat(build_lib_dst).st_mode | stat.S_IEXEC) def write_nnet_utils(self, model): """Copy the nnet_utils, AP types headers and any custom source to the project output directory @@ -1031,4 +1034,4 @@ def write_hls(self, model, is_multigraph=False): self.write_build_script_multigraph(model) self.write_bridge_multigraph(model) self.write_multigraph_weights(model) - print('Done') + print('Done') diff --git a/test/pytest/test_multi_graph.py b/test/pytest/test_multi_graph.py index 2f361c4709..8c33348bf9 100644 --- a/test/pytest/test_multi_graph.py +++ b/test/pytest/test_multi_graph.py @@ -1,12 +1,15 @@ from pathlib import Path + import numpy as np import pytest import tensorflow as tf -from tensorflow.keras.layers import Input, Conv2D, Activation, MaxPooling2D, Flatten, Dense +from tensorflow.keras.layers import Activation, Conv2D, Dense, Flatten, Input, MaxPooling2D + import hls4ml test_root_path = Path(__file__).parent + def create_test_model(): """ This architecture ensures testing of corner cases such as: @@ -21,16 +24,14 @@ def create_test_model(): output1 = Dense(5, activation='relu', name='dense1')(x) output2 = Dense(5, activation='relu', name='dense2')(x) model = tf.keras.Model(inputs=inp, outputs=[output1, output2]) - + return model + @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) @pytest.mark.parametrize('strategy', ['latency']) @pytest.mark.parametrize('granularity', ['model', 'name']) -@pytest.mark.parametrize('split_layers', [ - ('pool1', 'dense_common'), - ('relu1', 'flatten') -]) +@pytest.mark.parametrize('split_layers', [('pool1', 'dense_common'), ('relu1', 'flatten')]) def test_multimodelgraph_predict(split_layers, io_type, strategy, granularity): """ Tests the multi-graph splitting and stitching process. 
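The stitched build exercised by this test boils down to one call (flags exactly as used below; ``hls_model_multi`` is assumed to be compiled already):

.. code-block:: python

    report = hls_model_multi.build(
        csim=False, cosim=False, vsynth=False, export=True,
        stitch_design=True, sim_stitched_design=True,
        export_stitched_design=True,
    )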
@@ -42,7 +43,6 @@ def test_multimodelgraph_predict(split_layers, io_type, strategy, granularity): model = create_test_model() model.compile(optimizer='adam', loss='categorical_crossentropy') X_input = np.random.rand(5, 4, 4, 3).astype(np.float32) - keras_pred = model.predict(X_input) config = hls4ml.utils.config_from_keras_model(model, granularity=granularity, default_precision='ap_fixed<32,16>') config['Model']['Strategy'] = strategy @@ -52,11 +52,7 @@ def test_multimodelgraph_predict(split_layers, io_type, strategy, granularity): # --- Monolithic HLS conversion (no split) --- hls_model_mono = hls4ml.converters.convert_from_keras_model( - model, - hls_config=config, - output_dir=output_dir_mono, - backend=backend, - io_type=io_type + model, hls_config=config, output_dir=output_dir_mono, backend=backend, io_type=io_type ) hls_model_mono.compile() pred_mono = hls_model_mono.predict(X_input) @@ -68,7 +64,7 @@ def test_multimodelgraph_predict(split_layers, io_type, strategy, granularity): output_dir=output_dir_multi, backend=backend, io_type=io_type, - split_layer_names=list(split_layers) + split_layer_names=list(split_layers), ) hls_model_multi.compile() pred_multi = hls_model_multi.predict(X_input) @@ -78,17 +74,26 @@ def test_multimodelgraph_predict(split_layers, io_type, strategy, granularity): for mono_out, multi_out in zip(pred_mono, pred_multi): np.testing.assert_allclose(multi_out, mono_out, rtol=0, atol=1e-5) - + if granularity == 'name': if io_type == 'io_parallel' and split_layers == ('relu1', 'flatten'): - pytest.skip("Skipping RTL simulation for io_parallel with split layer at flatten due to improper simulation behavior.") + pytest.skip( + "Skipping RTL simulation for io_parallel with split layer at flatten due to improper simulation behavior." 
+ ) # --- Optional: Build the HLS project and run simulation --- - hls_model_multi.build(csim=False, cosim=False, vsynth=False, export=True, - stitch_design=True, sim_stitched_design=True, export_stitched_design=True) + hls_model_multi.build( + csim=False, + cosim=False, + vsynth=False, + export=True, + stitch_design=True, + sim_stitched_design=True, + export_stitched_design=True, + ) # test only the first sample, as batch prediction is not supported for stitched RTL simulations inp = np.expand_dims(X_input[0], axis=0) - sim_results = hls_model_multi.predict(inp, sim = 'rtl') + sim_results = hls_model_multi.predict(inp, sim='rtl') for sim_out, pred_out in zip(sim_results, list([pred_multi[0][0], pred_multi[1][0]])): np.testing.assert_allclose(sim_out, pred_out, rtol=0, atol=0.3) From 773c411c50cf5b838b9cedaf9c3479533b8c1aa0 Mon Sep 17 00:00:00 2001 From: dimdano Date: Mon, 10 Mar 2025 11:31:22 +0100 Subject: [PATCH 44/50] removed pandas dependency in read_testbench_log --- hls4ml/utils/simulation_utils.py | 79 +++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 28 deletions(-) diff --git a/hls4ml/utils/simulation_utils.py b/hls4ml/utils/simulation_utils.py index d9f610d932..5c75d962e0 100644 --- a/hls4ml/utils/simulation_utils.py +++ b/hls4ml/utils/simulation_utils.py @@ -1,7 +1,8 @@ +import csv import os +from collections import defaultdict import numpy as np -import pandas as pd def write_verilog_testbench(nn_config, testbench_output_path): @@ -510,33 +511,55 @@ def read_testbench_log(testbench_log_path, outputs): return {} try: - df = pd.read_csv(testbench_log_path) - BestLatency = df[df['output_name'] == 'BestLatency']['value'].iloc[0] - WorstLatency = df[df['output_name'] == 'WorstLatency']['value'].iloc[0] - output_df = df[~df['output_name'].isin(['BestLatency', 'WorstLatency'])] - - sim_dict = {'BestLatency': int(BestLatency), 'WorstLatency': int(WorstLatency), 'BehavSimResults': []} - - ordered_output_names = [entry['name'] for entry in outputs] - for name in ordered_output_names: - group = output_df[output_df['output_name'] == name] - if group.empty: - print(f"Warning: Expected output '{name}' not found in testbench log.") - continue - - indices = group['index'].astype(int) - values = group['value'].astype(float) - array = np.zeros(max(indices) + 1, dtype=np.float64) - array[indices] = values - sim_dict['BehavSimResults'].append(array) - - if len(sim_dict['BehavSimResults']) == 1: - sim_dict['BehavSimResults'] = sim_dict['BehavSimResults'][0] - - return sim_dict - - except (KeyError, IndexError) as e: - print(f"Error: Missing expected columns or values in the file: {e}") + with open(testbench_log_path, encoding='utf-8') as file: + reader = csv.reader(file) + header = next(reader) + required_columns = {'output_name', 'value', 'index'} + if not required_columns.issubset(set(header)): + print("Error: Missing required columns in the CSV file.") + return {} + + col_index = {col: idx for idx, col in enumerate(header)} + best_latency = worst_latency = None + output_data = defaultdict(list) + + for row in reader: + output_name = row[col_index['output_name']] + value = row[col_index['value']] + if output_name == 'BestLatency': + best_latency = int(value) + elif output_name == 'WorstLatency': + worst_latency = int(value) + else: + index = int(row[col_index['index']]) + output_data[output_name].append((index, float(value))) + + if best_latency is None or worst_latency is None: + print("Error: BestLatency or WorstLatency not found.") + return {} + sim_dict = {'BestLatency': 
best_latency, 'WorstLatency': worst_latency, 'BehavSimResults': []} + ordered_output_names = [entry['name'] for entry in outputs] + + for name in ordered_output_names: + if name not in output_data: + print(f"Warning: Expected output '{name}' not found in testbench log.") + continue + + indices_values = output_data[name] + max_index = max(index for index, _ in indices_values) + array = np.zeros(max_index + 1, dtype=np.float64) + for index, value in indices_values: + array[index] = value + sim_dict['BehavSimResults'].append(array) + + # If only one set of results, return it as a single array instead of a list + if len(sim_dict['BehavSimResults']) == 1: + sim_dict['BehavSimResults'] = sim_dict['BehavSimResults'][0] + + return sim_dict + + except (KeyError, IndexError, ValueError) as e: + print(f"Error: Issue with CSV file format or data: {e}") return {} except Exception as e: print(f"An unexpected error occurred: {e}") From b91f97a750bba902971a1fcef7ae35e6823850a5 Mon Sep 17 00:00:00 2001 From: dimdano Date: Fri, 14 Mar 2025 16:24:05 +0100 Subject: [PATCH 45/50] Ensure stitched RTL simulation results align with CSim output --- hls4ml/model/graph.py | 12 +++++++++++- hls4ml/utils/simulation_utils.py | 8 ++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index f67bc89208..db6e0ba8f3 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -1189,7 +1189,17 @@ def predict(self, x, sim='csim'): graph_reports=self.graph_reports, simulation_input_data=x, ) - return stitched_report['BehavSimResults'] + + results = stitched_report.get('BehavSimResults', []) + if isinstance(results, np.ndarray): + return results.astype(np.float32) if x.dtype in [np.single, np.float32] else results.astype(np.float64) + elif isinstance(results, list): + return [ + arr.astype(np.float32) if x.dtype in [np.single, np.float32] else arr.astype(np.float64) + for arr in results + ] + else: + return results else: print('Unknown simulation option given.') diff --git a/hls4ml/utils/simulation_utils.py b/hls4ml/utils/simulation_utils.py index 5c75d962e0..d0b8ac781c 100644 --- a/hls4ml/utils/simulation_utils.py +++ b/hls4ml/utils/simulation_utils.py @@ -395,9 +395,9 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(f' for (idx_{i} = 0; idx_{i} < {batch_size}; idx_{i} = idx_{i} + 1) begin\n') f.write(f' fixed_val_{i} = {layer_name}_tdata[(idx_{i}+1)*{total_bits}-1 -: {total_bits}];\n') f.write(f' real_val_{i} = fixed_val_{i} / (1.0 * (1 << {f_bits}));\n') - f.write(f' $display("Output {layer_name}[%0d]: %f", idx_{i}, real_val_{i});\n') + f.write(f' $display("Output {layer_name}[%0d]: %.15f", idx_{i}, real_val_{i});\n') f.write(' // Log result to CSV\n') - f.write(f' $fwrite(csv_file, "%s,%0d,%f\\n", "{layer_name}", idx_{i}, real_val_{i});\n') + f.write(f' $fwrite(csv_file, "%s,%0d,%.15f\\n", "{layer_name}", idx_{i}, real_val_{i});\n') f.write(' end\n') else: f.write( @@ -407,9 +407,9 @@ def write_verilog_testbench(nn_config, testbench_output_path): for idx in range(batch_size): f.write(f' fixed_val_{i} = {layer_name}_{idx}[{total_bits - 1}:0];\n') f.write(f' real_val_{i} = fixed_val_{i} / (1.0 * (1 << {f_bits}));\n') - f.write(f' $display("Output {layer_name}_{idx}: %f", real_val_{i});\n') + f.write(f' $display("Output {layer_name}_{idx}: %.15f", real_val_{i});\n') f.write(' // Log result to CSV\n') - f.write(f' $fwrite(csv_file, "%s,%0d,%f\\n", "{layer_name}", {idx}, real_val_{i});\n') + f.write(f' $fwrite(csv_file, 
"%s,%0d,%.15f\\n", "{layer_name}", {idx}, real_val_{i});\n') f.write(' end\n') f.write(' end\n\n') From 3dcd0d5bd8714bc1b2177a4bc3ca13052f144183 Mon Sep 17 00:00:00 2001 From: dimdano Date: Wed, 16 Apr 2025 10:47:55 +0200 Subject: [PATCH 46/50] parallel subgraph compilation --- .../templates/vivado/build_lib_multigraph.sh | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/hls4ml/templates/vivado/build_lib_multigraph.sh b/hls4ml/templates/vivado/build_lib_multigraph.sh index 8a7c13dbc1..ee6eb83723 100644 --- a/hls4ml/templates/vivado/build_lib_multigraph.sh +++ b/hls4ml/templates/vivado/build_lib_multigraph.sh @@ -21,20 +21,29 @@ WEIGHTS_DIR="\"${BASEDIR}/stitched/firmware/weights\"" mkdir -p "${OUTPUT_DIR}" -# Compile all graphs +# Compile all graphs in parallel OBJECT_FILES=() +PIDS=() + for g in "${graph_project_names[@]}"; do SRC_FILE="${g}/firmware/${ORIGINAL_PROJECT}_${g}.cpp" OBJ_FILE="${ORIGINAL_PROJECT}_${g}.o" AP_TYPES_PATH="-I${BASEDIR}/${g}/firmware/ap_types/" - - ${CC} ${CFLAGS} ${AP_TYPES_PATH} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c "${BASEDIR}/${SRC_FILE}" -o "${OBJ_FILE}" + ( + ${CC} ${CFLAGS} ${AP_TYPES_PATH} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c "${BASEDIR}/${SRC_FILE}" -o "${OBJ_FILE}" + ) & + PIDS+=($!) OBJECT_FILES+=("${OBJ_FILE}") - INCFLAGS+="-I${BASEDIR}/${g}/ " + INCFLAGS+="-I${BASEDIR}/${g}/ " # Only generic include paths here done -${CC} ${CFLAGS} ${INCFLAGS} ${AP_TYPES_PATH} -c "${PROJECT}_bridge.cpp" -o ${PROJECT}_bridge.o +for pid in "${PIDS[@]}"; do + wait $pid +done + +AP_TYPES_PATH="-I${BASEDIR}/${graph_project_names[@]: -1}/firmware/ap_types/" +${CC} ${CFLAGS} ${INCFLAGS} ${AP_TYPES_PATH} -c "${PROJECT}_bridge.cpp" -o ${PROJECT}_bridge.o ${CC} ${CFLAGS} ${INCFLAGS} ${AP_TYPES_PATH} -shared "${OBJECT_FILES[@]}" ${PROJECT}_bridge.o -o "${OUTPUT_DIR}/${PROJECT}-${LIB_STAMP}.so" rm -f "${OBJECT_FILES[@]}" From fa3e679297146f20d9f396565509fb17c521d0c9 Mon Sep 17 00:00:00 2001 From: dimdano Date: Wed, 16 Apr 2025 12:30:13 +0200 Subject: [PATCH 47/50] added additional checks in ip_stitcher --- .../templates/vivado/build_lib_multigraph.sh | 2 +- hls4ml/templates/vivado/ip_stitcher.tcl | 24 ++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/hls4ml/templates/vivado/build_lib_multigraph.sh b/hls4ml/templates/vivado/build_lib_multigraph.sh index ee6eb83723..5e22596062 100644 --- a/hls4ml/templates/vivado/build_lib_multigraph.sh +++ b/hls4ml/templates/vivado/build_lib_multigraph.sh @@ -34,7 +34,7 @@ for g in "${graph_project_names[@]}"; do ) & PIDS+=($!) OBJECT_FILES+=("${OBJ_FILE}") - INCFLAGS+="-I${BASEDIR}/${g}/ " # Only generic include paths here + INCFLAGS+="-I${BASEDIR}/${g}/ " done for pid in "${PIDS[@]}"; do diff --git a/hls4ml/templates/vivado/ip_stitcher.tcl b/hls4ml/templates/vivado/ip_stitcher.tcl index 71146884bb..4f6d7c27ff 100644 --- a/hls4ml/templates/vivado/ip_stitcher.tcl +++ b/hls4ml/templates/vivado/ip_stitcher.tcl @@ -139,6 +139,16 @@ proc stitch_procedure {base_dir stitch_project_name original_project_name bd_nam # Create external ports for 'ap_clk' and 'ap_rst' # ap_clk if {[llength $ap_clk_ports] > 0} { + set clk_freq [get_property CONFIG.FREQ_HZ [lindex $ap_clk_ports 0]] + + # Warn if modules are synthesized with different clk + foreach clk_pin $ap_clk_ports { + if {[get_property CONFIG.FREQ_HZ $clk_pin] ne $clk_freq} { + puts "Warning: Inconsistent CONFIG.FREQ_HZ for ap_clk ports." 
+ break + } + } + # NOTE: Probably we will need the lowest clock frequency among all IPs here create_bd_port -dir I -type clk -freq_hz 100000000 ap_clk set ap_clk_port [get_bd_ports ap_clk] # Connect all 'ap_clk' pins to the 'ap_clk' port @@ -153,6 +163,14 @@ proc stitch_procedure {base_dir stitch_project_name original_project_name bd_nam set sample_rst_pin [lindex $ap_rst_ports 0] set rst_polarity [get_property CONFIG.POLARITY $sample_rst_pin] + foreach ap_rst_port $ap_rst_ports { + # All ports should have the same polarity + if {[get_property CONFIG.POLARITY $ap_rst_port] ne $rst_polarity} { + puts "Error: Inconsistent CONFIG.POLARITY for ap_rst ports. Aborting." + exit 1 + } + } + # Only proceed if the polarity is defined if {$rst_polarity ne ""} { # Create the 'ap_rst' port @@ -247,6 +265,9 @@ proc stitch_procedure {base_dir stitch_project_name original_project_name bd_nam set layer_out_ports_by_index($index) $port } elseif {[regexp {^layer(?:\d+_)?out_(\d+)_ap_vld$} $port_name all index]} { set layer_out_vld_ports_by_index($index) $port + } else { + # NOTE: We expect data ports to follow the previous naming pattern + # NOTE: This is not treated as an error because it might be a valid control port or non-standard signal } } @@ -288,7 +309,8 @@ proc stitch_procedure {base_dir stitch_project_name original_project_name bd_nam # Connect the ports connect_bd_net $out_vld_port $in_vld_port } else { - puts "Warning: No matching input ap_vld port found for output [get_property NAME $out_vld_port]" + puts "Error: No matching input ap_vld port found for output [get_property NAME $out_vld_port]" + exit 1 } } From 05d22d3718f193fe8ac85d1a305e0c7f1ee240dd Mon Sep 17 00:00:00 2001 From: dimdano Date: Wed, 16 Apr 2025 14:53:51 +0200 Subject: [PATCH 48/50] small improvements on MultiModelGraph --- hls4ml/converters/keras_to_hls.py | 3 --- hls4ml/model/graph.py | 15 ++++++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index b0f1964331..a60eca2256 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ -327,10 +327,7 @@ def keras_to_hls(config, split_layer_names=None): layer_list, input_layers, output_layers, output_shapes = parse_keras_model(model_arch, reader) print('Creating HLS model...') - merge_layers = ['add', 'subtract', 'multiply', 'average', 'maximum', 'minimum', 'concatenate', 'dot'] if split_layer_names: - if any(any(layer in name for layer in merge_layers) for name in split_layer_names): - raise ValueError('Split layer must not be a merge layer') hls_model = ModelGraph.make_multi_graph( config, layer_list, input_layers, output_layers, output_shapes, split_layer_names ) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index db6e0ba8f3..4032e6ccc7 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -920,11 +920,16 @@ def make_multi_graph(cls, config, layer_list, input_layers, output_layers, outpu raise ValueError("No split layer names provided.") layer_names = [layer['name'] for layer in layer_list] + restricted_merge_layers = {'add', 'subtract', 'multiply', 'average', 'maximum', 'minimum', 'concatenate', 'dot'} - # NOTE - Might need to validate again that split layer names exist in layer list - for name in split_layer_names: - if name not in layer_names: - raise ValueError(f"Layer '{name}' not found in the model.") + # Validate that each provided split layer exists and is not a merge layer. 
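+        # e.g. split_layer_names=['relu1', 'dense_common'] yields three chained
+        # subgraphs; merge layers such as 'add' or 'concatenate' cannot serve as
+        # split points because each subgraph must begin from a single input.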
+ for split_layer in split_layer_names: + if split_layer not in layer_names: + raise ValueError(f"Layer '{split_layer}' not found in the model.") + + layer = next(layer for layer in layer_list if layer['name'] == split_layer) + if layer.get('class_name', "").lower() in restricted_merge_layers: + raise ValueError('Split layer must not be a merge layer') # Split the layer_list into subgraphs split_indices = sorted([layer_names.index(name) for name in split_layer_names]) @@ -1171,7 +1176,7 @@ def build_wrapper(idx, g, **kwargs): def compile(self): for g in self.graphs: - g.compile() + g.write() # Bypass VitisWriter and invoke write_hls directly from VivadoWriter super(self.backend.writer.__class__, self.backend.writer).write_hls(self, is_multigraph=True) self._compile() From 3a74eea9fc6715ac6c2523c51d004005129e7614 Mon Sep 17 00:00:00 2001 From: dimdano Date: Wed, 30 Apr 2025 14:57:43 +0200 Subject: [PATCH 49/50] correct AXIS port slicing for Verilog simulation --- hls4ml/backends/vitis/vitis_backend.py | 11 ++++- hls4ml/model/graph.py | 2 + hls4ml/utils/simulation_utils.py | 64 +++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 4 deletions(-) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index a0c2207c96..8fe490369a 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -9,6 +9,7 @@ from hls4ml.model.flow import get_flow, register_flow from hls4ml.report import aggregate_graph_reports, parse_vivado_report from hls4ml.utils.simulation_utils import ( + annotate_axis_stream_widths, prepare_testbench_input, prepare_zero_input, read_testbench_log, @@ -150,6 +151,7 @@ def build( def build_stitched_design( self, + model, stitch_design=True, sim_stitched_design=False, export_stitched_design=False, @@ -179,8 +181,13 @@ def build_stitched_design( print(f"Error: {e}. 
Cannot copy 'ip_stitcher.tcl' to {nn_config['StitchedProjectName']} folder.") if nn_config: - with open(nn_config_path, "w") as file: - json.dump(nn_config, file, indent=4) + if nn_config['outputs'][0]['pragma'] == 'stream': + last_graph_project_path = os.path.join( + model.graphs[-1].config.get_output_dir(), model.graphs[-1].config.get_project_dir() + ) + annotate_axis_stream_widths(nn_config, last_graph_project_path) + with open(nn_config_path, "w") as file: + json.dump(nn_config, file, indent=4) if sim_stitched_design: write_verilog_testbench(nn_config, testbench_path) diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 4032e6ccc7..1301ecdcda 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -1164,6 +1164,7 @@ def build_wrapper(idx, g, **kwargs): raise FileExistsError(f"Vivado stitched project folder '{vivado_folder}' already exists.") nn_config = self.parse_nn_config() stitched_report = self.backend.build_stitched_design( + self, stitch_design=stitch_design, sim_stitched_design=sim_stitched_design, export_stitched_design=export_stitched_design, @@ -1187,6 +1188,7 @@ def predict(self, x, sim='csim'): elif sim == 'rtl': nn_config = self.parse_nn_config() stitched_report = self.backend.build_stitched_design( + self, stitch_design=False, sim_stitched_design=True, export_stitched_design=False, diff --git a/hls4ml/utils/simulation_utils.py b/hls4ml/utils/simulation_utils.py index d0b8ac781c..a38be3babc 100644 --- a/hls4ml/utils/simulation_utils.py +++ b/hls4ml/utils/simulation_utils.py @@ -1,10 +1,66 @@ import csv import os +import xml.etree.ElementTree as ET from collections import defaultdict import numpy as np +def parse_component_xml(component_xml_path): + """ + Parse the component.xml from the generated Vivado IP and return a dict of the port metadata. + """ + if not os.path.exists(component_xml_path): + raise FileNotFoundError(f"component.xml not found at {component_xml_path}") + + tree = ET.parse(component_xml_path) + root = tree.getroot() + + ns = { + 'spirit': 'http://www.spiritconsortium.org/XMLSchema/SPIRIT/1685-2009', + 'xilinx': 'http://www.xilinx.com', + 'xsi': 'http://www.w3.org/2001/XMLSchema-instance', + } + + ports = root.findall('.//spirit:model/spirit:ports/spirit:port', namespaces=ns) + port_dict = {} + + for port in ports: + name = port.find('spirit:name', namespaces=ns).text + wire = port.find('spirit:wire', namespaces=ns) + if wire is not None: + direction = wire.find('spirit:direction', namespaces=ns).text + vector = wire.find('spirit:vector', namespaces=ns) + if vector is not None: + left = vector.find('spirit:left', namespaces=ns).text + right = vector.find('spirit:right', namespaces=ns).text + width = abs(int(left) - int(right)) + 1 + else: + width = 1 + + port_dict[name] = {'direction': direction, 'width': width} + + return port_dict + + +def annotate_axis_stream_widths(nn_config, vivado_project_path): + """ + Annotate nn_config with Vivado IP AXIS bus and per-element widths. 
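+
+    Each entry in nn_config['outputs'] gains 'axis_bus_width' (the full TDATA
+    width read from component.xml) and 'axis_element_width' (the bus width
+    divided by batch_size), e.g. a 160-bit bus carrying 5 outputs gives
+    32-bit element slots.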
+ """ + component_xml_path = os.path.join(vivado_project_path, 'solution1/impl/ip/component.xml') + port_dict = parse_component_xml(component_xml_path) + for layer in nn_config.get('outputs', []): + batch = layer['batch_size'] + # find the TDATA port (case-insensitive) + port, info = next(((p, i) for p, i in port_dict.items() if p.lower() == f"{layer['name']}_tdata"), (None, None)) + if port is None: + raise KeyError(f"No TDATA port for '{layer['name']}' in {component_xml_path}") + w = info['width'] + if w % batch: + raise ValueError(f"Bus width ({w}) not divisible by No of output elements ({batch})") + layer.update({'axis_bus_width': w, 'axis_element_width': w // batch}) + + def write_verilog_testbench(nn_config, testbench_output_path): """ Generate a Verilog testbench for a given neural network configuration. @@ -71,7 +127,8 @@ def write_verilog_testbench(nn_config, testbench_output_path): name = layer["name"] total_bits = layer['integer_bits'] + layer['fractional_bits'] batch_size = layer['batch_size'] - f.write(f' wire [{(total_bits * batch_size) - 1}:0] {name}_tdata;\n') + axis_bus_width = layer['axis_bus_width'] + f.write(f' wire [{axis_bus_width - 1}:0] {name}_tdata;\n') f.write(f' wire {name}_tvalid;\n') f.write(f' reg {name}_tready;\n\n') else: @@ -391,9 +448,12 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write(f' real real_val_{i};\n') f.write(' always @(posedge ap_clk) begin\n') if pragma == 'stream': + axis_element_width = layer['axis_element_width'] f.write(f' if (done_counter == 1 && {layer_name}_tvalid && {layer_name}_tready) begin\n') f.write(f' for (idx_{i} = 0; idx_{i} < {batch_size}; idx_{i} = idx_{i} + 1) begin\n') - f.write(f' fixed_val_{i} = {layer_name}_tdata[(idx_{i}+1)*{total_bits}-1 -: {total_bits}];\n') + f.write( + f' fixed_val_{i} = {layer_name}_tdata[idx_{i}*{axis_element_width} +: {total_bits}];\n' + ) f.write(f' real_val_{i} = fixed_val_{i} / (1.0 * (1 << {f_bits}));\n') f.write(f' $display("Output {layer_name}[%0d]: %.15f", idx_{i}, real_val_{i});\n') f.write(' // Log result to CSV\n') From da42a6da4469ed569bbb9850951bd2fc15425525 Mon Sep 17 00:00:00 2001 From: dimdano Date: Wed, 14 May 2025 09:50:37 +0200 Subject: [PATCH 50/50] Generate Verilog testbench inputs using C++ bridge --- hls4ml/backends/vitis/vitis_backend.py | 37 +++------ hls4ml/model/graph.py | 48 +++++++++++- hls4ml/templates/vivado/myproject_bridge.cpp | 2 + hls4ml/utils/simulation_utils.py | 79 +++++++------------- hls4ml/writer/vivado_writer.py | 64 ++++++++++++++++ test/pytest/test_multi_graph.py | 36 +++++---- 6 files changed, 166 insertions(+), 100 deletions(-) diff --git a/hls4ml/backends/vitis/vitis_backend.py b/hls4ml/backends/vitis/vitis_backend.py index 8fe490369a..738fd880b6 100644 --- a/hls4ml/backends/vitis/vitis_backend.py +++ b/hls4ml/backends/vitis/vitis_backend.py @@ -10,10 +10,8 @@ from hls4ml.report import aggregate_graph_reports, parse_vivado_report from hls4ml.utils.simulation_utils import ( annotate_axis_stream_widths, - prepare_testbench_input, - prepare_zero_input, + prepare_tb_inputs, read_testbench_log, - write_testbench_input, write_verilog_testbench, ) @@ -155,11 +153,11 @@ def build_stitched_design( stitch_design=True, sim_stitched_design=False, export_stitched_design=False, - nn_config=None, graph_reports=None, simulation_input_data=None, ): + nn_config = model.nn_config os.makedirs(nn_config['OutputDir'], exist_ok=True) stitched_design_dir = os.path.join(nn_config['OutputDir'], nn_config['StitchedProjectName']) if stitch_design: @@ 
-180,31 +178,18 @@ def build_stitched_design( except Exception as e: print(f"Error: {e}. Cannot copy 'ip_stitcher.tcl' to {nn_config['StitchedProjectName']} folder.") - if nn_config: - if nn_config['outputs'][0]['pragma'] == 'stream': - last_graph_project_path = os.path.join( - model.graphs[-1].config.get_output_dir(), model.graphs[-1].config.get_project_dir() - ) - annotate_axis_stream_widths(nn_config, last_graph_project_path) - with open(nn_config_path, "w") as file: - json.dump(nn_config, file, indent=4) + if nn_config['outputs'][0]['pragma'] == 'stream': + last_graph_project_path = os.path.join( + model.graphs[-1].config.get_output_dir(), model.graphs[-1].config.get_project_dir() + ) + annotate_axis_stream_widths(nn_config, last_graph_project_path) + with open(nn_config_path, "w") as file: + json.dump(nn_config, file, indent=4) if sim_stitched_design: write_verilog_testbench(nn_config, testbench_path) - # Produce a testbench input file for every input layer - for i, layer in enumerate(nn_config['inputs']): - testbench_input_path = os.path.join(stitched_design_dir, f"{layer['name']}_input_data.txt") - # We reshape input simulation data to (fifo_depth, batch_size) - if simulation_input_data is None: - input_data_reshaped = prepare_zero_input(layer) - print("No simulation input provided. Using zero-filled inputs.") - else: - # Handles both single and multi-layer cases. First dim should always be batch size - data = simulation_input_data[i] - input_data_reshaped = prepare_testbench_input(data, layer['fifo_depth'], layer['batch_size']) - write_testbench_input( - input_data_reshaped, testbench_input_path, layer['integer_bits'], layer['fractional_bits'] - ) + tb_inputs = prepare_tb_inputs(simulation_input_data, nn_config['inputs']) + model.write_tb_inputs(tb_inputs, stitched_design_dir) print('Verilog testbench and its input data were generated.') print('Running build process of stitched IP...\n') diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 1301ecdcda..2e2a874f91 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -1026,6 +1026,7 @@ def make_multi_graph(cls, config, layer_list, input_layers, output_layers, outpu class MultiModelGraph: def __init__(self, graphs): self.graphs = graphs + self.nn_config = None self._initialize_config(graphs[0]) self._bind_modelgraph_methods() self._initialize_io_attributes(graphs) @@ -1162,13 +1163,12 @@ def build_wrapper(idx, g, **kwargs): vivado_folder = os.path.join(self.config.config['OutputDir'], self.config.config['StitchedProjectName']) if os.path.exists(vivado_folder): raise FileExistsError(f"Vivado stitched project folder '{vivado_folder}' already exists.") - nn_config = self.parse_nn_config() + self.nn_config = self.parse_nn_config() stitched_report = self.backend.build_stitched_design( self, stitch_design=stitch_design, sim_stitched_design=sim_stitched_design, export_stitched_design=export_stitched_design, - nn_config=nn_config, graph_reports=self.graph_reports, ) return stitched_report @@ -1178,6 +1178,7 @@ def build_wrapper(idx, g, **kwargs): def compile(self): for g in self.graphs: g.write() + self.nn_config = self.parse_nn_config() # Bypass VitisWriter and invoke write_hls directly from VivadoWriter super(self.backend.writer.__class__, self.backend.writer).write_hls(self, is_multigraph=True) self._compile() @@ -1186,13 +1187,12 @@ def predict(self, x, sim='csim'): if sim == 'csim': return self._predict(x) elif sim == 'rtl': - nn_config = self.parse_nn_config() + self.nn_config = self.parse_nn_config() 
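+            # The Verilog testbench consumes exactly fifo_depth * batch_size
+            # values per input layer; prepare_tb_inputs enforces this on x.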
stitched_report = self.backend.build_stitched_design( self, stitch_design=False, sim_stitched_design=True, export_stitched_design=False, - nn_config=nn_config, graph_reports=self.graph_reports, simulation_input_data=x, ) @@ -1292,6 +1292,46 @@ def _make_stamp(self): stamp = uuid.uuid4() return str(stamp)[-length:] + def write_tb_inputs(self, x, folder_path): + """ + Dump inputs (for Verilog testbench) via the C++ bridge functions: + dump_tb_inputs_float + dump_tb_inputs_double + """ + if self._top_function_lib is None: + self._compile() + + if isinstance(x, (list, tuple)): + xlist = list(x) + else: + xlist = [x] + + first = xlist[0] + if first.dtype in [np.single, np.float32]: + fn_name = "dump_tb_inputs_float" + ctype = ctypes.c_float + elif first.dtype in [np.double, np.float64]: + fn_name = "dump_tb_inputs_double" + ctype = ctypes.c_double + else: + raise Exception( + 'Invalid type ({}) of numpy array. Supported types are: single, float32, double, float64, float_.'.format( + first.dtype + ) + ) + + for arr in xlist: + if arr.dtype != first.dtype: + raise ValueError("All inputs must have same dtype") + if not arr.flags["C_CONTIGUOUS"]: + raise ValueError("Input arrays must be C_CONTIGUOUS") + + fn = getattr(self._top_function_lib, fn_name) + fn.restype = None + fn.argtypes = [ctypes.c_char_p] + [npc.ndpointer(ctype, flags="C_CONTIGUOUS") for _ in xlist] + + fn(folder_path.encode("ascii"), *xlist) + def _replace_logos(self): spec = importlib.util.find_spec("hls4ml") hls4ml_path = os.path.dirname(spec.origin) diff --git a/hls4ml/templates/vivado/myproject_bridge.cpp b/hls4ml/templates/vivado/myproject_bridge.cpp index b1822a5ff6..8aa76a703b 100644 --- a/hls4ml/templates/vivado/myproject_bridge.cpp +++ b/hls4ml/templates/vivado/myproject_bridge.cpp @@ -48,6 +48,8 @@ void collect_trace_output(struct trace_data *c_trace_outputs) { } } +// hls-fpga-machine-learning insert tb_input_writer + // Wrapper of top level function for Python bridge void myproject_float( // hls-fpga-machine-learning insert header #float diff --git a/hls4ml/utils/simulation_utils.py b/hls4ml/utils/simulation_utils.py index a38be3babc..2684ad7a60 100644 --- a/hls4ml/utils/simulation_utils.py +++ b/hls4ml/utils/simulation_utils.py @@ -509,57 +509,34 @@ def write_verilog_testbench(nn_config, testbench_output_path): f.write('endmodule\n') -def float_to_fixed(float_value, integer_bits=6, fractional_bits=10): - scaling_factor = 1 << fractional_bits - total_bits = integer_bits + fractional_bits - max_val = (1 << (total_bits - 1)) - 1 - min_val = -(1 << (total_bits - 1)) - - float_value = float(float_value) # Convert to Python float if it's a numpy type - - fixed_value = int(np.round(float_value * scaling_factor)) - fixed_value = max(min(fixed_value, max_val), min_val) - - if fixed_value < 0: - fixed_value = fixed_value + (1 << total_bits) # Two's complement - - return fixed_value - - -def write_testbench_input(float_inputs, file_name, integer_bits=6, fractional_bits=10): - """ - Convert 1D or 2D arrays (or lists of floats) to fixed-point and write to file. - - If 'float_inputs' is 1D: writes a single line. - If 'float_inputs' is 2D: flattens each row and writes one line per row. 
- """ - with open(file_name, "w") as f: - if len(float_inputs) > 0 and isinstance(float_inputs[0], (list, np.ndarray)): - for row in float_inputs: - row_array = np.array(row).ravel() # flatten if necessary - fixed_line = [float_to_fixed(val, integer_bits, fractional_bits) for val in row_array] - f.write(" ".join(map(str, fixed_line)) + "\n") - else: - flattened = np.array(float_inputs).ravel() # ensure it's a flat array of scalars - fixed_line = [float_to_fixed(val, integer_bits, fractional_bits) for val in flattened] - f.write(" ".join(map(str, fixed_line)) + "\n") - - -def prepare_zero_input(layer): - batch_size = layer['batch_size'] - fifo_depth = layer['fifo_depth'] - zero_input = np.zeros((fifo_depth, batch_size), dtype=np.int32) - return zero_input - - -def prepare_testbench_input(data, fifo_depth, batch_size): - data_arr = np.array(data) - # Ensure that total elements = fifo_depth * batch_size - total_elements = fifo_depth * batch_size - if data_arr.size != total_elements: - raise ValueError(f"Data size {data_arr.size} does not match fifo_depth * batch_size = {total_elements}") - data_reshaped = data_arr.reshape((fifo_depth, batch_size)) - return data_reshaped +def prepare_zero_inputs(input_layers): + zero_list = [np.zeros((layer['fifo_depth'], layer['batch_size']), dtype=np.float32) for layer in input_layers] + return zero_list[0] if len(zero_list) == 1 else zero_list + + +def prepare_tb_inputs(simulation_input_data, input_layers): + if simulation_input_data is None: + return prepare_zero_inputs(input_layers) + + if isinstance(simulation_input_data, np.ndarray): + data_list = [simulation_input_data] + elif isinstance(simulation_input_data, (list, tuple)): + data_list = list(simulation_input_data) + else: + raise TypeError(f"simulation_input_data must be None, ndarray or list/tuple, got {type(simulation_input_data)}") + + if len(data_list) != len(input_layers): + raise ValueError(f"Expected {len(input_layers)} input arrays, got {len(data_list)}.") + + reshaped = [] + for data, layer in zip(data_list, input_layers): + arr = np.asarray(data) + total = layer['fifo_depth'] * layer['batch_size'] + if arr.size != total: + raise ValueError(f"Layer '{layer['name']}' has {arr.size} elements; expected {total}.") + reshaped.append(arr.reshape((layer['fifo_depth'], layer['batch_size']))) + + return reshaped[0] if len(reshaped) == 1 else reshaped def read_testbench_log(testbench_log_path, outputs): diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index 9ef5503ab5..fdf400ac64 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -830,6 +830,70 @@ def write_bridge_multigraph(self, model): if namespace is not None: newline += indent + f'using namespace {namespace};\n' + elif '// hls-fpga-machine-learning insert tb_input_writer' in line: + funcs = [ + ("float", "dump_tb_inputs_float"), + ("double", "dump_tb_inputs_double"), + ] + newline = "" + for dtype, funcname in funcs: + newline += f'void {funcname}(\n' + newline += ' const char* output_path' + for inp in model_inputs: + newline += f',\n {dtype} {inp.name}[{inp.size_cpp()}]' + newline += '\n) {\n\n' + + for inp in model_inputs: + decl = inp.definition_cpp(name_suffix='_ap').strip() + ap = inp.name + "_ap" + if decl.startswith("hls::stream"): + newline += f' {decl};\n' + else: + newline += f' {inp.type.name} {ap}[{inp.size_cpp()}];\n' + newline += ( + f' nnet::convert_data<{dtype}, {inp.type.name}, {inp.size_cpp()}>' f'({inp.name}, {ap});\n' + ) + newline += "\n" + newline += f' 
std::ofstream fout(std::string(output_path) + "/{inp.name}_input_data.txt");\n'
+
+                    for inp in model_inputs:
+                        decl = inp.definition_cpp(name_suffix='_ap').strip()
+                        dims = inp.shape
+
+                        if decl.startswith("hls::stream"):
+                            if len(dims) == 1:
+                                N = dims[0]
+                                newline += f'    for(int i = 0; i < {N}; i++) {{\n'
+                                newline += f'        auto temp = {inp.name}_ap.read();\n'
+                                newline += (
+                                    f'        ap_uint<{inp.type.name}::value_type::width> bits = ' f'temp[0].range();\n'
+                                )
+                                newline += f'        fout << bits.to_uint()' f' << (i+1<{N} ? \' \' : \'\\n\');\n'
+                                newline += '    }\n'
+                            else:
+                                inputs_list = model.nn_config['inputs']
+                                fifo_depth = next((e['fifo_depth'] for e in inputs_list if e['name'] == inp.name), None)
+                                batch_size = next((e['batch_size'] for e in inputs_list if e['name'] == inp.name), None)
+                                newline += f'    for(int r = 0; r < {fifo_depth}; r++) {{\n'
+                                newline += f'        auto temp = {inp.name}_ap.read();\n'
+                                newline += f'        for(int c = 0; c < {batch_size}; c++) {{\n'
+                                newline += (
+                                    f'            ap_uint<{inp.type.name}::value_type::width> bits = ' f'temp[c].range();\n'
+                                )
+                                newline += (
+                                    f'            fout << bits.to_uint()' f' << (c+1<{batch_size} ? \' \' : \'\\n\');\n'
+                                )
+                                newline += '        }\n'
+                                newline += '    }\n'
+                        else:
+                            ap = inp.name + "_ap"
+                            N = inp.size_cpp()
+                            newline += f'    for(int i = 0; i < {N}; i++) {{\n'
+                            newline += f'        ap_uint<{inp.type.name}::width> bits = ' f'{ap}[i].range();\n'
+                            newline += f'        fout << bits.to_uint()' f' << (i+1<{N} ? \' \' : \'\\n\');\n'
+                            newline += '    }\n'
+                    newline += "    fout.close();\n"
+                    newline += "}\n"
             else:
                 newline = line
             fout.write(newline)
diff --git a/test/pytest/test_multi_graph.py b/test/pytest/test_multi_graph.py
index 8c33348bf9..540be798ec 100644
--- a/test/pytest/test_multi_graph.py
+++ b/test/pytest/test_multi_graph.py
@@ -2,8 +2,8 @@
 import numpy as np
 import pytest
-import tensorflow as tf
-from tensorflow.keras.layers import Activation, Conv2D, Dense, Flatten, Input, MaxPooling2D
+from tensorflow.keras.layers import Activation, Dense, GlobalAveragePooling1D, Input
+from tensorflow.keras.models import Model

 import hls4ml


@@ -15,23 +15,26 @@ def create_test_model():
     This architecture ensures testing of corner cases such as:
     double layer outputs and a variety of layers to serve as splitting points.
""" - inp = Input(shape=(4, 4, 3), name='input_layer') - x = Conv2D(4, (3, 3), padding='same', name='conv1')(inp) + inp = Input(shape=(6, 8), name='input_layer') + x = Dense(16, name='dense1')(inp) x = Activation('relu', name='relu1')(x) - x = MaxPooling2D((2, 2), name='pool1')(x) - x = Flatten(name='flatten')(x) - x = Dense(16, activation='relu', name='dense_common')(x) - output1 = Dense(5, activation='relu', name='dense1')(x) - output2 = Dense(5, activation='relu', name='dense2')(x) - model = tf.keras.Model(inputs=inp, outputs=[output1, output2]) - + x = Dense(8, name='dense2')(x) + x = Activation('relu', name='relu2')(x) + x = GlobalAveragePooling1D(name='avg_pool')(x) + x = Dense(16, name='dense_common')(x) + x = Activation('relu', name='relu_common')(x) + output1 = Dense(5, name='dense1_out')(x) + output1 = Activation('relu', name='relu_out1')(output1) + output2 = Dense(5, name='dense2_out')(x) + output2 = Activation('relu', name='relu_out2')(output2) + model = Model(inputs=inp, outputs=[output1, output2]) return model @pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream']) @pytest.mark.parametrize('strategy', ['latency']) @pytest.mark.parametrize('granularity', ['model', 'name']) -@pytest.mark.parametrize('split_layers', [('pool1', 'dense_common'), ('relu1', 'flatten')]) +@pytest.mark.parametrize('split_layers', [('dense2', 'avg_pool'), ('relu1', 'relu_common')]) def test_multimodelgraph_predict(split_layers, io_type, strategy, granularity): """ Tests the multi-graph splitting and stitching process. @@ -42,7 +45,7 @@ def test_multimodelgraph_predict(split_layers, io_type, strategy, granularity): backend = 'vitis' model = create_test_model() model.compile(optimizer='adam', loss='categorical_crossentropy') - X_input = np.random.rand(5, 4, 4, 3).astype(np.float32) + X_input = np.random.rand(5, 6, 8).astype(np.float32) config = hls4ml.utils.config_from_keras_model(model, granularity=granularity, default_precision='ap_fixed<32,16>') config['Model']['Strategy'] = strategy @@ -76,11 +79,6 @@ def test_multimodelgraph_predict(split_layers, io_type, strategy, granularity): np.testing.assert_allclose(multi_out, mono_out, rtol=0, atol=1e-5) if granularity == 'name': - if io_type == 'io_parallel' and split_layers == ('relu1', 'flatten'): - pytest.skip( - "Skipping RTL simulation for io_parallel with split layer at flatten due to improper simulation behavior." - ) - # --- Optional: Build the HLS project and run simulation --- hls_model_multi.build( csim=False, @@ -96,4 +94,4 @@ def test_multimodelgraph_predict(split_layers, io_type, strategy, granularity): inp = np.expand_dims(X_input[0], axis=0) sim_results = hls_model_multi.predict(inp, sim='rtl') for sim_out, pred_out in zip(sim_results, list([pred_multi[0][0], pred_multi[1][0]])): - np.testing.assert_allclose(sim_out, pred_out, rtol=0, atol=0.3) + np.testing.assert_allclose(sim_out, pred_out, rtol=0, atol=1e-5)