From 608c136d22142cbf0f1dde4e1a739bc225418c75 Mon Sep 17 00:00:00 2001 From: Vladimir Loncar Date: Fri, 21 Mar 2025 19:54:05 +0100 Subject: [PATCH] Initial implementation of Libero backend for PolarFire line of FPGAs --- hls4ml/backends/__init__.py | 2 + hls4ml/backends/libero/__init__.py | 0 hls4ml/backends/libero/libero_backend.py | 192 ++++ hls4ml/backends/libero/libero_types.py | 125 +++ .../backends/libero/passes/core_templates.py | 152 ++++ .../backends/libero/passes/pipeline_style.py | 101 +++ .../backends/libero/passes/transform_types.py | 47 + hls4ml/report/__init__.py | 1 + hls4ml/report/libero_report.py | 117 +++ hls4ml/templates/libero/build_lib.sh | 20 + hls4ml/templates/libero/firmware/defines.h | 19 + .../templates/libero/firmware/myproject.cpp | 19 + hls4ml/templates/libero/firmware/myproject.h | 19 + hls4ml/templates/libero/firmware/parameters.h | 19 + hls4ml/templates/libero/myproject_bridge.cpp | 69 ++ hls4ml/templates/libero/myproject_test.cpp | 96 ++ .../libero/nnet_utils/nnet_activation.h | 800 +++++++++++++++++ .../nnet_utils/nnet_activation_stream.h | 800 +++++++++++++++++ .../libero/nnet_utils/nnet_code_gen.h | 28 + .../templates/libero/nnet_utils/nnet_common.h | 65 ++ .../libero/nnet_utils/nnet_conv1d_latency.h | 167 ++++ .../templates/libero/nnet_utils/nnet_dense.h | 82 ++ .../libero/nnet_utils/nnet_dense_compressed.h | 89 ++ .../libero/nnet_utils/nnet_dense_latency.h | 72 ++ .../libero/nnet_utils/nnet_dense_resource.h | 270 ++++++ .../libero/nnet_utils/nnet_dense_stream.h | 105 +++ .../libero/nnet_utils/nnet_function_stubs.h | 51 ++ .../libero/nnet_utils/nnet_helpers.h | 279 ++++++ .../templates/libero/nnet_utils/nnet_mult.h | 118 +++ .../templates/libero/nnet_utils/nnet_stream.h | 223 +++++ .../templates/libero/nnet_utils/nnet_types.h | 66 ++ hls4ml/writer/__init__.py | 2 + hls4ml/writer/libero_writer.py | 844 ++++++++++++++++++ 33 files changed, 5059 insertions(+) create mode 100644 hls4ml/backends/libero/__init__.py create mode 100644 hls4ml/backends/libero/libero_backend.py create mode 100644 hls4ml/backends/libero/libero_types.py create mode 100644 hls4ml/backends/libero/passes/core_templates.py create mode 100644 hls4ml/backends/libero/passes/pipeline_style.py create mode 100644 hls4ml/backends/libero/passes/transform_types.py create mode 100644 hls4ml/report/libero_report.py create mode 100644 hls4ml/templates/libero/build_lib.sh create mode 100644 hls4ml/templates/libero/firmware/defines.h create mode 100644 hls4ml/templates/libero/firmware/myproject.cpp create mode 100644 hls4ml/templates/libero/firmware/myproject.h create mode 100644 hls4ml/templates/libero/firmware/parameters.h create mode 100644 hls4ml/templates/libero/myproject_bridge.cpp create mode 100644 hls4ml/templates/libero/myproject_test.cpp create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_activation.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_activation_stream.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_code_gen.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_common.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_conv1d_latency.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_dense.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_dense_compressed.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_dense_latency.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_dense_resource.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_dense_stream.h create mode 100644 
hls4ml/templates/libero/nnet_utils/nnet_function_stubs.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_helpers.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_mult.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_stream.h create mode 100644 hls4ml/templates/libero/nnet_utils/nnet_types.h create mode 100644 hls4ml/writer/libero_writer.py diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index 4a48f072cd..ea1b53b392 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -1,5 +1,6 @@ from hls4ml.backends.backend import Backend, get_available_backends, get_backend, register_backend # noqa: F401 from hls4ml.backends.fpga.fpga_backend import FPGABackend # noqa: F401 +from hls4ml.backends.libero.libero_backend import LiberoBackend from hls4ml.backends.oneapi.oneapi_backend import OneAPIBackend from hls4ml.backends.quartus.quartus_backend import QuartusBackend from hls4ml.backends.symbolic.symbolic_backend import SymbolicExpressionBackend @@ -18,3 +19,4 @@ register_backend('Catapult', CatapultBackend) register_backend('SymbolicExpression', SymbolicExpressionBackend) register_backend('oneAPI', OneAPIBackend) +register_backend('Libero', LiberoBackend) diff --git a/hls4ml/backends/libero/__init__.py b/hls4ml/backends/libero/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hls4ml/backends/libero/libero_backend.py b/hls4ml/backends/libero/libero_backend.py new file mode 100644 index 0000000000..a17d40af12 --- /dev/null +++ b/hls4ml/backends/libero/libero_backend.py @@ -0,0 +1,192 @@ +import os +import subprocess +import sys + +from hls4ml.backends import FPGABackend +from hls4ml.model.attributes import ChoiceAttribute +from hls4ml.model.flow import register_flow +from hls4ml.model.layers import Dense, Layer +from hls4ml.model.optimizer import layer_optimizer +from hls4ml.report import parse_libero_report + + +class LiberoBackend(FPGABackend): + def __init__(self): + super().__init__(name='Libero') + self._register_layer_attributes() + self._register_flows() + + def _register_layer_attributes(self): + strategy_layers = [ + Dense, + ] + + for layer in strategy_layers: + attrs = self.attribute_map.get(layer, []) + attrs.append( + ChoiceAttribute( + 'strategy', + choices=['Latency', 'Resource'], + default='Latency', + ) + ) + self.attribute_map[layer] = attrs + + def _register_flows(self): + initializers = self._get_layer_initializers() + init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name) + + libero_types = [ + 'libero:transform_types', + 'libero:set_pipeline_style', + ] + libero_types_flow = register_flow('specific_types', libero_types, requires=[init_flow], backend=self.name) + + template_flow = register_flow('apply_templates', self._get_layer_templates, requires=[init_flow], backend=self.name) + + writer_passes = ['make_stamp', 'libero:write_hls'] + self._writer_flow = register_flow('write', writer_passes, requires=['libero:ip'], backend=self.name) + + ip_flow_requirements = [ + 'optimize', + init_flow, + libero_types_flow, + template_flow, + ] + + self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name) + + def get_default_flow(self): + return self._default_flow + + def get_writer_flow(self): + return self._writer_flow + + def create_initial_config( + self, + fpga_family='PolarFire', + part='MPF300', + board='hw_only', + clock_period=5, + clock_uncertainty='27%', + io_type='io_parallel', + namespace=None, + 
write_weights_txt=True, + write_tar=False, + **_, + ): + """Create initial configuration of the Libero backend. + + Args: + part (str, optional): The FPGA part to be used. Defaults to 'MPF300'. + clock_period (int, optional): The clock period. Defaults to 5. + clock_uncertainty (str, optional): The clock uncertainty. Defaults to 27%. + io_type (str, optional): Type of implementation used. One of + 'io_parallel' or 'io_stream'. Defaults to 'io_parallel'. + namespace (str, optional): If defined, place all generated code within a namespace. Defaults to None. + write_weights_txt (bool, optional): If True, writes weights to .txt files which speeds up compilation. + Defaults to True. + write_tar (bool, optional): If True, compresses the output directory into a .tar.gz file. Defaults to False. + + Returns: + dict: initial configuration. + """ + config = {} + + config['FPGAFamily'] = fpga_family if fpga_family is not None else 'PolarFire' + config['Part'] = part if part is not None else 'MPF300' + config['Board'] = board if board is not None else 'hw_only' + config['ClockPeriod'] = clock_period if clock_period is not None else 5 + config['IOType'] = io_type if io_type is not None else 'io_parallel' + config['HLSConfig'] = {} + config['WriterConfig'] = { + 'Namespace': namespace, + 'WriteWeightsTxt': write_weights_txt, + 'WriteTar': write_tar, + } + + return config + + def build( + self, + model, + reset=False, + skip_preqs=False, + sw_compile=True, + hw=True, + cosim=False, + rtl_synth=False, + fpga=False, + **kwargs, + ): + """Build the model using Libero suite and SmartHLS compiler. Additional arguments passed to the function in form of + `=True` will be passed as an argument to the `shls` command. See SmartHLS user guide for list of possible + command line options. + + Args: + model (ModelGraph): Model to build + reset (bool, optional): Clean up any existing files. Defaults to False. + skip_preqs(bool, optional): Skip any prerequisite step that is outdated. Defaults to False. + sw_compile (bool, optional): Compile the generated HLS in software. Defaults to True. + hw (bool, optional): Compile the software to hardware, producing a set of Verilog HDL files. Defaults to True. + cosim (bool, optional): Run co-simulation. Defaults to False. + rtl_synth (bool, optional): Run RTL synthesis for resource results. This will take less time than `fpga`. + Defaults to False. + fpga (bool, optional): Synthesize the generated hardware to target FPGA. This runs RTL synthesis and + place-and-route for resource and timing results. Defaults to False. + + Raises: + Exception: Raised if the `shls` command has not been found + CalledProcessError: Raised if SmartHLS returns non-zero code for any of the commands executed + + Returns: + dict: Detailed report produced by SmartHLS. + """ + if 'linux' in sys.platform: + found = os.system('command -v shls > /dev/null') + if found != 0: + raise Exception('Libero/SmartHLS installation not found. 
Make sure "shls" is on PATH.') + + def run_shls_cmd(cmd_name): + subprocess.run( + ['shls', '-s', cmd_name], + shell=False, + check=True, + stdout=sys.stdout, + stderr=sys.stderr, + cwd=model.config.get_output_dir(), + ) + + if reset: + run_shls_cmd('clean') + if sw_compile: + run_shls_cmd('sw_compile') + if hw: + run_shls_cmd('hw') + if cosim: + run_shls_cmd('cosim') + if rtl_synth: + run_shls_cmd('rtl_synth') + if fpga: + run_shls_cmd('fpga') + + for arg_name, arg_val in kwargs.items(): + if arg_val: + run_shls_cmd(arg_name) + + return parse_libero_report(model.config.get_output_dir()) + + @layer_optimizer(Layer) + def init_base_layer(self, layer): + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('reuse_factor', reuse_factor) + + @layer_optimizer(Dense) + def init_dense(self, layer): + if layer.model.config.is_resource_strategy(layer): + n_in, n_out = self.get_layer_mult_size(layer) + self.set_target_reuse_factor(layer) + self.set_closest_reuse_factor(layer, n_in, n_out) + layer.set_attr('strategy', 'resource') + else: + layer.set_attr('strategy', 'latency') diff --git a/hls4ml/backends/libero/libero_types.py b/hls4ml/backends/libero/libero_types.py new file mode 100644 index 0000000000..9ce0364de8 --- /dev/null +++ b/hls4ml/backends/libero/libero_types.py @@ -0,0 +1,125 @@ +from hls4ml.backends.fpga.fpga_types import ( + ArrayVariableConverter, + ExponentPrecisionType, + FixedPrecisionConverter, + FixedPrecisionType, + InplaceStreamVariableConverter, + IntegerPrecisionType, + PrecisionDefinition, + StreamVariableConverter, + VariableDefinition, + XnorPrecisionType, +) + +# region ArrayVariable + + +class LiberoArrayVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + return '{type} {name}{suffix}[{shape}]'.format( + type=self.type.name, name=self.name, suffix=name_suffix, shape=self.size_cpp() + ) + + +class LiberoInplaceArrayVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class LiberoArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Libero', definition_cls=LiberoArrayVariableDefinition) + + +class LiberoInplaceArrayVariableConverter(ArrayVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Libero', definition_cls=LiberoInplaceArrayVariableDefinition) + + +# endregion + +# region StreamVariable + + +class LiberoStreamVariableDefinition(VariableDefinition): + def definition_cpp(self, name_suffix='', as_reference=False): + if as_reference: # Function parameter + return f'hls::FIFO<{self.type.name}> &{self.name}{name_suffix}' + else: # Declaration + return 'hls::FIFO<{type}> {name}{suffix}({depth})'.format( + type=self.type.name, name=self.name, depth=self.pragma[1], suffix=name_suffix + ) + + +class LiberoInplaceStreamVariableDefinition(VariableDefinition): + def definition_cpp(self): + return f'auto& {self.name} = {self.input_var.name}' + + +class LiberoStreamVariableConverter(StreamVariableConverter): + def __init__(self, type_converter): + super().__init__(type_converter=type_converter, prefix='Libero', definition_cls=LiberoStreamVariableDefinition) + + +# endregion + +# region InplaceStreamVariable + + +class LiberoInplaceStreamVariableConverter(InplaceStreamVariableConverter): + def __init__(self, type_converter): + super().__init__( + type_converter=type_converter, 
prefix='Libero', definition_cls=LiberoInplaceStreamVariableDefinition + ) + + +# endregion + +# region Precision types + + +class LAPIntegerPrecisionDefinition(PrecisionDefinition): + def definition_cpp(self): + typestring = 'hls::ap_{signed}int<{width}>'.format(signed='u' if not self.signed else '', width=self.width) + return typestring + + +class LAPFixedPrecisionDefinition(PrecisionDefinition): + def _rounding_mode_cpp(self, mode): + if mode is not None: + return 'AP_' + str(mode) + + def _saturation_mode_cpp(self, mode): + if mode is not None: + return 'AP_' + str(mode) + + def definition_cpp(self): + args = [ + self.width, + self.integer, + self._rounding_mode_cpp(self.rounding_mode), + self._saturation_mode_cpp(self.saturation_mode), + ] + if args[2] == 'AP_TRN' and args[3] == 'AP_WRAP': + # This is the default, so we won't write the full definition for brevity + args[2] = args[3] = None + + args = ','.join([str(arg) for arg in args if arg is not None]) + typestring = 'hls::ap_{signed}fixpt<{args}>'.format(signed='u' if not self.signed else '', args=args) + return typestring + + +class LAPTypeConverter(FixedPrecisionConverter): + def __init__(self): + super().__init__( + type_map={ + FixedPrecisionType: LAPFixedPrecisionDefinition, + IntegerPrecisionType: LAPIntegerPrecisionDefinition, + ExponentPrecisionType: LAPIntegerPrecisionDefinition, + XnorPrecisionType: LAPIntegerPrecisionDefinition, + }, + prefix='LAP', + ) + + +# endregion diff --git a/hls4ml/backends/libero/passes/core_templates.py b/hls4ml/backends/libero/passes/core_templates.py new file mode 100644 index 0000000000..c3ac18d17c --- /dev/null +++ b/hls4ml/backends/libero/passes/core_templates.py @@ -0,0 +1,152 @@ +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Activation, BatchNormalization, Dense + +# Dense templates + +dense_config_template = """struct config{index} : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned strategy = nnet::{strategy}; + static const unsigned reuse_factor = {reuse}; + static const unsigned n_zeros = {nzeros}; + static const unsigned n_nonzeros = {nonzeros}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in * n_out, reuse_factor) - n_zeros / reuse_factor; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + template + using product = nnet::product::{product_type}; +}};\n""" + +dense_function_template = 'nnet::{dense_function}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});' + +dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h'] + + +class DenseConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Dense) + self.template = dense_config_template + + def format(self, node): + params = self._default_config_params(node) + params['nzeros'] = node.get_weights('weight').nzeros + params['nonzeros'] = node.get_weights('weight').nonzeros + params['product_type'] = get_backend('libero').product_type( + node.get_input_variable().type.precision, node.get_weights('weight').type.precision + ) + + return self.template.format(**params) + + +class DenseFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Dense, include_header=dense_include_list) + self.template = 
dense_function_template + + def format(self, node): + params = self._default_function_params(node) + params['w'] = node.get_weights('weight').name + params['b'] = node.get_weights('bias').name + + if node.get_attr('strategy').lower() == 'latency': + params['dense_function'] = 'dense_latency' + elif node.get_attr('strategy').lower() == 'resource': + if int(params['reuse_factor']) <= int(params['n_in']): + params['dense_function'] = 'dense_resource_rf_leq_nin' + else: + params['dense_function'] = 'dense_resource_rf_gt_nin_rem0' + # The 3rd case is never used + + return self.template.format(**params) + + +# BatchNormalization templates + +batchnorm_config_template = """struct config{index} : nnet::batchnorm_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_filt = {n_filt}; + static const unsigned n_scale_bias = (n_filt == -1) ? n_in : n_filt; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + static const unsigned multiplier_limit = DIV_ROUNDUP(n_in, reuse_factor); + static const bool store_weights_in_bram = false; + typedef {bias_t.name} bias_t; + typedef {scale_t.name} scale_t; + template + using product = nnet::product::{product_type}; +}};\n""" + +batchnorm_function_template = 'nnet::normalize<{input_t}, {output_t}, {config}>({input}, {output}, {scale}, {bias});' + +batchnorm_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h'] + + +class BatchNormalizationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(BatchNormalization) + self.template = batchnorm_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + params['product_type'] = get_backend('libero').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) + + return self.template.format(**params) + + +class BatchNormalizationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(BatchNormalization, include_header=batchnorm_include_list) + self.template = batchnorm_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) + + +# Activation templates + +activ_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + typedef {table_t.name} table_t; +}};\n""" + +activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' + +activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h'] + + +class ActivationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Activation) + self.template = activ_config_template + + def format(self, node): + params = self._default_config_params(node) + params['type'] = node.get_attr('activation') + + return self.template.format(**params) + + +class ActivationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__((Activation), include_header=activ_include_list) + self.template = activ_function_template + + def format(self, node): + params = self._default_function_params(node) + params['activation'] = 
node.get_attr('activation').lower() + params['config'] = '{}_config{}'.format(node.get_attr('activation'), node.index) + + return self.template.format(**params) diff --git a/hls4ml/backends/libero/passes/pipeline_style.py b/hls4ml/backends/libero/passes/pipeline_style.py new file mode 100644 index 0000000000..4d4ce38f55 --- /dev/null +++ b/hls4ml/backends/libero/passes/pipeline_style.py @@ -0,0 +1,101 @@ +from hls4ml.model.layers import Conv1D, Conv2D +from hls4ml.model.optimizer import ModelOptimizerPass + + +class SetPipelineStyle(ModelOptimizerPass): + def __init__(self): + pass + + def transform(self, model): + if model.config.pipeline_style not in ['auto', 'pipeline', 'dataflow']: + print( + f'WARNING: Pipeline style set to {model.config.pipeline_style}, valid values: auto, pipeline, dataflow. ' + 'Using "auto".' + ) + self._set_pipeline_style(model, 'auto') + + if model.config.pipeline_style is None or model.config.pipeline_style == 'auto': + + if self._maybe_set_dataflow_io_stream(model): + return True + + if self._maybe_set_dataflow_conv_layers(model): + return True + + if self._maybe_set_dataflow_resource_strategy(model): + return True + + if self._maybe_set_pipeline_io_parallel(model): + return True + + self._set_safe_default_dataflow(model) + return True + else: + self._validate_hls_config(model) + + return False # No model changes made + + def _set_pipeline_style(self, model, pipeline_style): + # Could add logging here + model.config.pipeline_style = pipeline_style + + def _maybe_set_dataflow_io_stream(self, model): + if model.config.get_config_value('IOType') == 'io_stream': + self._set_pipeline_style(model, 'dataflow') + return True + + return False + + def _maybe_set_dataflow_conv_layers(self, model): + for layer in model.get_layers(): + if isinstance(layer, (Conv1D, Conv2D)): + self._set_pipeline_style(model, 'dataflow') + return True + + return False + + def _maybe_set_dataflow_resource_strategy(self, model): + for layer in model.get_layers(): + if model.config.is_resource_strategy(layer): + self._set_pipeline_style(model, 'dataflow') + return True + + return False + + def _maybe_set_pipeline_io_parallel(self, model): + if model.config.get_config_value('IOType') == 'io_parallel': + self._set_pipeline_style(model, 'pipeline') + return True + + return False + + def _set_safe_default_dataflow(self, model): + print( + 'WARNING: Couldn\'t determine best pipeline style, defaulting to "DATAFLOW". ' + 'Use "PipelineStyle" property to override.' + ) + self._set_pipeline_style(model, 'dataflow') + + def _validate_hls_config(self, model): + if model.config.pipeline_style.lower() == 'pipeline': + if model.config.model_strategy.lower() == 'resource': + print( + 'WARNING: Model strategy "Resource" will lead to bad QoR in combination ' + 'with pipeline style set to "pipeline".' + ) + if any(isinstance(layer, (Conv1D, Conv2D)) for layer in model.get_layers()): + print('WARNING: Convolution layers require "dataflow" pipeline style.') + for layer_type, strategy in model.config.layer_type_strategy.items(): + if strategy.lower() == 'resource' and model.config.pipeline_style.lower() == 'pipeline': + print( + f'WARNING: Strategy for layer type {layer_type} set to "Resource", while pipeline style set to ' + '"pipeline". This will lead to bad QoR.' 
+ ) + + for layer_name, strategy in model.config.layer_name_strategy.items(): + if strategy.lower() == 'resource' and model.config.pipeline_style.lower() == 'pipeline': + print( + 'WARNING: Strategy for layer {} set to "Resource", while pipeline style set to "pipeline".'.format( + layer_name + ) + ) diff --git a/hls4ml/backends/libero/passes/transform_types.py b/hls4ml/backends/libero/passes/transform_types.py new file mode 100644 index 0000000000..7271e24c73 --- /dev/null +++ b/hls4ml/backends/libero/passes/transform_types.py @@ -0,0 +1,47 @@ +from hls4ml.backends.fpga.fpga_types import HLSTypeConverter, StaticWeightVariableConverter +from hls4ml.backends.libero.libero_types import ( + LAPTypeConverter, + LiberoArrayVariableConverter, + LiberoInplaceArrayVariableConverter, + LiberoInplaceStreamVariableConverter, + LiberoStreamVariableConverter, +) +from hls4ml.model.optimizer import GlobalOptimizerPass +from hls4ml.model.types import InplaceTensorVariable + + +class TransformTypes(GlobalOptimizerPass): + def __init__(self): + self.type_converter = HLSTypeConverter(precision_converter=LAPTypeConverter()) + self.array_var_converter = LiberoArrayVariableConverter(type_converter=self.type_converter) + self.inplace_array_var_converter = LiberoInplaceArrayVariableConverter(type_converter=self.type_converter) + self.stream_var_converter = LiberoStreamVariableConverter(type_converter=self.type_converter) + self.inplace_stream_var_converter = LiberoInplaceStreamVariableConverter(type_converter=self.type_converter) + self.weight_var_converter = StaticWeightVariableConverter(type_converter=self.type_converter) + + def transform(self, model, node): + io_type = node.model.config.get_config_value('IOType') + + for out_name, var in node.variables.items(): + if io_type == 'io_stream': + if isinstance(var, InplaceTensorVariable): + new_var = self.inplace_stream_var_converter.convert(var) + else: + new_var = self.stream_var_converter.convert(var) + elif io_type == 'io_parallel': + if isinstance(var, InplaceTensorVariable): + new_var = self.inplace_array_var_converter.convert(var, pragma='') + else: + new_var = self.array_var_converter.convert(var, pragma='partition') + else: + raise Exception(f'Unknown IOType {io_type} in {node.name} ({node.class_name})') + + node.set_attr(out_name, new_var) + + for w_name, weight in node.weights.items(): + new_weight = self.weight_var_converter.convert(weight) + node.set_attr(w_name, new_weight) + + for t_name, type in node.types.items(): + new_type = self.type_converter.convert(type) + node.set_attr(t_name, new_type) diff --git a/hls4ml/report/__init__.py b/hls4ml/report/__init__.py index 3c9b7707b7..c24207eda0 100644 --- a/hls4ml/report/__init__.py +++ b/hls4ml/report/__init__.py @@ -1,6 +1,7 @@ from hls4ml.report.catapult_report import parse_catapult_report # noqa: F401 from hls4ml.report.catapult_report import qofr # noqa: F401 from hls4ml.report.catapult_report import read_catapult_report # noqa: F401 +from hls4ml.report.libero_report import parse_libero_report # noqa: F401 from hls4ml.report.quartus_report import parse_quartus_report # noqa: F401 from hls4ml.report.quartus_report import read_quartus_report # noqa: F401 from hls4ml.report.vivado_report import parse_vivado_report # noqa: F401 diff --git a/hls4ml/report/libero_report.py b/hls4ml/report/libero_report.py new file mode 100644 index 0000000000..4591d8f338 --- /dev/null +++ b/hls4ml/report/libero_report.py @@ -0,0 +1,117 @@ +from pathlib import Path + + +def parse_libero_report(out_dir): + """Reads and 
parses an FPGA synthesis report into a structured dictionary.""" + + out_path = Path(out_dir) + report_path = out_path / 'hls_output/reports/summary.results.rpt' + if not report_path.exists(): + print(f'Libero report file {str(report_path)} not found.') + return {} + + with open(report_path) as file: + report_lines = file.readlines() + + return { + 'Simulation Result': _parse_sim_data(_extract_section(report_lines, '====== 1. Simulation Result')), + 'Timing Result': _parse_timing_data(_extract_section(report_lines, '====== 2. Timing Result')), + 'Resource Usage': parse_utilization_data(_extract_section(report_lines, '====== 3. Resource Usage')), + } + + +def _extract_section(lines, section_header): + """Extracts table data from a given section in the report.""" + + section_data = [] + in_section = False + + for line in lines: + if section_header in line: + in_section = True + continue + + if in_section: + if line.startswith('======'): # Start of next section + break + section_data.append(line.strip()) + + return section_data + + +def _parse_sim_data(data): + """Parses 'Simulation Result' section.""" + if len(data) == 0: + return {'Error': 'Data missing for this section'} + + sim_dict = {} + + for line in data: + if line.startswith('N/A. Please run'): + return {'Error': line} + elif line.startswith('+') or line.startswith('| Top-Level Name'): + continue # Ignore table borders + elif '|' in line: + columns = [col.strip() for col in line.split('|')[2:-1]] + sim_dict.update( + { + 'Number of calls': columns[0], + 'Simulation time (cycles)': columns[1], + 'Call Latency (min/max/avg)': columns[2], + 'Call II (min/max/avg)': columns[3], + } + ) + elif 'SW/HW co-simulation' in line: + sim_dict['Status'] = line.split(':')[1].strip() + + return sim_dict + + +def _parse_timing_data(data): + """Parses 'Timing Result' section.""" + if len(data) == 0: + return {'Error': 'Data missing for this section'} + + timing_dict = {} + + for line in data: + if line.startswith('N/A. Please run'): + return {'Error': line} + elif line.startswith('+') or line.startswith('| Clock Domain'): + continue # Ignore table borders + elif '|' in line: + columns = [col.strip() for col in line.split('|')[2:-1]] + timing_dict.update( + { + 'Target Period': columns[0], + 'Target Fmax': columns[1], + 'Worst Slack': columns[2], + 'Period': columns[3], + 'Fmax': columns[4], + } + ) + + return timing_dict + + +def parse_utilization_data(data): + """Parses 'Resource Usage' section.""" + if len(data) == 0: + return {'Error': 'Data missing for this section'} + + util_dict = {} + + for line in data: + if line.startswith('N/A. 
Please run'): + return {'Error': line} + elif line.startswith('+') or line.startswith('| Resource Type'): + continue # Ignore table borders + elif '|' in line: + columns = [col.strip() for col in line.split('|')[1:-1]] + util_dict[columns[0]] = { + 'Used': columns[1], + 'Total': columns[2], + 'Percentage': columns[3], + } + + return util_dict diff --git a/hls4ml/templates/libero/build_lib.sh b/hls4ml/templates/libero/build_lib.sh new file mode 100644 index 0000000000..a0860a462e --- /dev/null +++ b/hls4ml/templates/libero/build_lib.sh @@ -0,0 +1,20 @@ +#!/bin/bash +set -e + +CC=g++ +if [[ "$OSTYPE" == "linux-gnu" ]]; then + CFLAGS="-O3 -fPIC -fno-gnu-unique" +elif [[ "$OSTYPE" == "darwin"* ]]; then + CFLAGS="-O3 -fPIC" +fi +LDFLAGS= +INCFLAGS="-I/opt/microchip/Libero_SoC_v2024.2/SmartHLS-2024.2/SmartHLS/smarthls-library/hls" +PROJECT=myproject +LIB_STAMP=mystamp +BASEDIR="$(cd "$(dirname "$0")" && pwd)" +WEIGHTS_DIR="\"${BASEDIR}/firmware/weights\"" + +${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c firmware/${PROJECT}.cpp -o ${PROJECT}.o +${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o +${CC} ${CFLAGS} ${INCFLAGS} -shared ${PROJECT}.o ${PROJECT}_bridge.o -o firmware/${PROJECT}-${LIB_STAMP}.so +rm -f *.o diff --git a/hls4ml/templates/libero/firmware/defines.h b/hls4ml/templates/libero/firmware/defines.h new file mode 100644 index 0000000000..fcdf1e080d --- /dev/null +++ b/hls4ml/templates/libero/firmware/defines.h @@ -0,0 +1,19 @@ +#ifndef DEFINES_H_ +#define DEFINES_H_ + +#include "nnet_utils/nnet_types.h" +#include +#include +#include +#include +#include + +// hls-fpga-machine-learning insert numbers + +// hls-fpga-machine-learning insert namespace-start + +// hls-fpga-machine-learning insert layer-precision + +// hls-fpga-machine-learning insert namespace-end + +#endif diff --git a/hls4ml/templates/libero/firmware/myproject.cpp b/hls4ml/templates/libero/firmware/myproject.cpp new file mode 100644 index 0000000000..c231adb47a --- /dev/null +++ b/hls4ml/templates/libero/firmware/myproject.cpp @@ -0,0 +1,19 @@ +#include + +#include "myproject.h" +#include "parameters.h" + +// hls-fpga-machine-learning insert namespace-start + +void myproject( + // hls-fpga-machine-learning insert header +) { + #pragma HLS function top + // hls-fpga-machine-learning insert IO + + // hls-fpga-machine-learning insert load weights + + // hls-fpga-machine-learning insert layers +} + +// hls-fpga-machine-learning insert namespace-end diff --git a/hls4ml/templates/libero/firmware/myproject.h b/hls4ml/templates/libero/firmware/myproject.h new file mode 100644 index 0000000000..0412ddc2d8 --- /dev/null +++ b/hls4ml/templates/libero/firmware/myproject.h @@ -0,0 +1,19 @@ +#ifndef MYPROJECT_H_ +#define MYPROJECT_H_ + +#include +#include +#include + +#include "defines.h" + +// hls-fpga-machine-learning insert namespace-start + +// Prototype of top level function for C-synthesis +void myproject( + // hls-fpga-machine-learning insert header +); + +// hls-fpga-machine-learning insert namespace-end + +#endif diff --git a/hls4ml/templates/libero/firmware/parameters.h b/hls4ml/templates/libero/firmware/parameters.h new file mode 100644 index 0000000000..feee517633 --- /dev/null +++ b/hls4ml/templates/libero/firmware/parameters.h @@ -0,0 +1,19 @@ +#ifndef PARAMETERS_H_ +#define PARAMETERS_H_ + +#include +#include + +#include "nnet_utils/nnet_code_gen.h" +#include "nnet_utils/nnet_helpers.h" +// hls-fpga-machine-learning insert includes + +// 
hls-fpga-machine-learning insert weights + +// hls-fpga-machine-learning insert namespace-start + +// hls-fpga-machine-learning insert layer-config + +// hls-fpga-machine-learning insert namespace-end + +#endif diff --git a/hls4ml/templates/libero/myproject_bridge.cpp b/hls4ml/templates/libero/myproject_bridge.cpp new file mode 100644 index 0000000000..b1822a5ff6 --- /dev/null +++ b/hls4ml/templates/libero/myproject_bridge.cpp @@ -0,0 +1,69 @@ +#ifndef MYPROJECT_BRIDGE_H_ +#define MYPROJECT_BRIDGE_H_ + +#include "firmware/myproject.h" +#include "firmware/nnet_utils/nnet_helpers.h" +#include +#include + +// hls-fpga-machine-learning insert bram + +namespace nnet { +bool trace_enabled = false; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet + +extern "C" { + +struct trace_data { + const char *name; + void *data; +}; + +void allocate_trace_storage(size_t element_size) { + nnet::trace_enabled = true; + nnet::trace_outputs = new std::map; + nnet::trace_type_size = element_size; + // hls-fpga-machine-learning insert trace_outputs +} + +void free_trace_storage() { + for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { + void *ptr = i->second; + free(ptr); + } + nnet::trace_outputs->clear(); + delete nnet::trace_outputs; + nnet::trace_outputs = NULL; + nnet::trace_enabled = false; +} + +void collect_trace_output(struct trace_data *c_trace_outputs) { + int ii = 0; + for (std::map::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) { + c_trace_outputs[ii].name = i->first.c_str(); + c_trace_outputs[ii].data = i->second; + ii++; + } +} + +// Wrapper of top level function for Python bridge +void myproject_float( + // hls-fpga-machine-learning insert header #float +) { + // hls-fpga-machine-learning insert namespace + + // hls-fpga-machine-learning insert wrapper #float +} + +void myproject_double( + // hls-fpga-machine-learning insert header #double +) { + // hls-fpga-machine-learning insert namespace + + // hls-fpga-machine-learning insert wrapper #double +} +} + +#endif diff --git a/hls4ml/templates/libero/myproject_test.cpp b/hls4ml/templates/libero/myproject_test.cpp new file mode 100644 index 0000000000..2dc963906e --- /dev/null +++ b/hls4ml/templates/libero/myproject_test.cpp @@ -0,0 +1,96 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "firmware/myproject.h" +#include "firmware/nnet_utils/nnet_helpers.h" + +// hls-fpga-machine-learning insert bram + +#define CHECKPOINT 5000 + +namespace nnet { +bool trace_enabled = true; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet + +int main(int argc, char **argv) { + // hls-fpga-machine-learning insert namespace + + // load input data from text file + std::ifstream fin("tb_data/tb_input_features.dat"); + // load predictions from text file + std::ifstream fpr("tb_data/tb_output_predictions.dat"); + +#ifdef RTL_SIM + std::string RESULTS_LOG = "tb_data/rtl_cosim_results.log"; +#else + std::string RESULTS_LOG = "tb_data/csim_results.log"; +#endif + std::ofstream fout(RESULTS_LOG); + + std::string iline; + std::string pline; + int e = 0; + + if (fin.is_open() && fpr.is_open()) { + while (std::getline(fin, iline) && std::getline(fpr, pline)) { + if (e % CHECKPOINT == 0) + std::cout << "Processing input " << e << std::endl; + char *cstr = const_cast(iline.c_str()); + char *current; + std::vector in; + current = strtok(cstr, " "); + while (current != 
NULL) { + in.push_back(atof(current)); + current = strtok(NULL, " "); + } + cstr = const_cast(pline.c_str()); + std::vector pr; + current = strtok(cstr, " "); + while (current != NULL) { + pr.push_back(atof(current)); + current = strtok(NULL, " "); + } + + // hls-fpga-machine-learning insert data + + // hls-fpga-machine-learning insert top-level-function + + if (e % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + } + e++; + + // hls-fpga-machine-learning insert tb-output + } + fin.close(); + fpr.close(); + } else { + std::cout << "INFO: Unable to open input/predictions file, using default input." << std::endl; + const unsigned NUM_TEST_SAMPLES = 5; + for (unsigned i = 0; i < NUM_TEST_SAMPLES; i++) { + // hls-fpga-machine-learning insert zero + + // hls-fpga-machine-learning insert top-level-function + + // hls-fpga-machine-learning insert output + + // hls-fpga-machine-learning insert tb-output + } + } + + fout.close(); + std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; + + return 0; +} diff --git a/hls4ml/templates/libero/nnet_utils/nnet_activation.h b/hls4ml/templates/libero/nnet_utils/nnet_activation.h new file mode 100644 index 0000000000..014cd9570d --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_activation.h @@ -0,0 +1,800 @@ +#ifndef NNET_ACTIVATION_H_ +#define NNET_ACTIVATION_H_ + +#include "nnet_common.h" +#include +#include +#include + +namespace nnet { + +struct activ_config { + // IO size + static const unsigned n_in = 10; + + // Internal info + static const unsigned table_size = 1024; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef hls::ap_fixpt<18, 8> table_t; +}; + +// ************************************************* +// LINEAR Activation -- See Issue 53 +// ************************************************* +template void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS loop pipeline + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + res[ii] = data[ii]; + } +} + +// ************************************************* +// RELU Activation +// ************************************************* +template void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + + data_T datareg; + #pragma HLS loop pipeline + for (uint ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +template +void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + + data_T datareg; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg < 0) + res[ii] = 0; + else if (datareg > MAX_INT) + res[ii] = MAX_INT; + else + res[ii] = datareg; + } +} + +template void relu6(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + relu_max(data, res); +} + +template void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + relu_max(data, res); +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* +inline float sigmoid_fcn_float(float input) { return 1.0 / (1 + std::exp(-input)); } + +template void init_sigmoid_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default logistic sigmoid function: + // 
result = 1/(1+e^(-x)) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = sigmoid_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_sigmoid_table(sigmoid_table); + initialized = true; + } + + // Index into the lookup table based on data + int data_round; + int index; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)sigmoid_table[index]; + } +} + +// ************************************************* +// Softmax Activation +// ************************************************* + +enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 }; + +inline float exp_fcn_float(float input) { return std::exp(input); } + +template inline float softmax_real_val_from_idx(unsigned i) { + // Treat the index as the top N bits + static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table + data_T x(0); + x(x.width - 1, x.width - N) = i; + return (float)x; +} + +template inline unsigned softmax_idx_from_real_val(data_T x) { + // Slice the top N bits to get an index into the table + static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table + hls::ap_uint y = x(x.width - 1, x.width - N); // slice the top N bits of input + return (unsigned)y(N - 1, 0); +} + +template +void init_exp_table(typename CONFIG_T::exp_table_t table_out[CONFIG_T::table_size]) { + // The template data_T is the data type used to address the table + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { + // Slicing bits for address is going to round towards 0, so take the central value + float x = softmax_real_val_from_idx(i); + typename CONFIG_T::exp_table_t exp_x = exp_fcn_float(x); + table_out[i] = exp_x; + } +} + +template +void init_invert_table(typename CONFIG_T::inv_table_t table_out[CONFIG_T::table_size]) { + // The template data_T is the data type used to address the table + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { + float x = softmax_real_val_from_idx(i); + typename CONFIG_T::inv_table_t inv_x = 1 / x; + table_out[i] = inv_x; + } +} + +template +void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS function pipeline + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if 
(!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + // Calculate all the e^x's + #pragma HLS memory partition variable(exp_res) type(complete) + typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + typename CONFIG_T::exp_table_t exp_sum(0); + #pragma HLS loop unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + unsigned x = softmax_idx_from_real_val(data[i]); + exp_res[i] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. + // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + #pragma HLS loop unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + res[i] = exp_res[i] * inv_exp_sum; + } +} + +template +void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS function pipeline + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + // Find the max and compute all delta(x_i, x_max) + Op_max op_max; + data_T x_max = reduce>(data, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + hls::ap_fixpt d_xi_xmax[CONFIG_T::n_in]; + #pragma HLS loop unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + d_xi_xmax[i] = data[i] - x_max; + } + + // Calculate all the e^x's + #pragma HLS memory partition variable(exp_res) type(complete) + typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + typename CONFIG_T::exp_table_t exp_sum(0); + #pragma HLS loop unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + unsigned x = softmax_idx_from_real_val(d_xi_xmax[i]); + exp_res[i] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
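+    // Descriptive note (added): the reduce<> helper invoked below performs a balanced binary-tree
+    // summation of exp_res (depth roughly log2(n_in)) rather than a sequential accumulation,
+    // which keeps the critical path of the exponential sum short.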
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + #pragma HLS loop unroll + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + res[i] = exp_res[i] * inv_exp_sum; + } +} + +template void init_exp_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) { + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = exp_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template void init_invert_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Inversion function: + // result = 1/x + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range 0 to +64) + float in_val = 64.0 * ii / float(N_TABLE); + // Next, compute lookup table function + if (in_val > 0.0) + table_out[ii] = 1.0 / in_val; + else + table_out[ii] = 0.0; + } +} + +template +void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_exp_table_legacy(exp_table); + init_invert_table_legacy(invert_table); + initialized = true; + } + + // Index into the lookup table based on data for exponentials + typename CONFIG_T::table_t exp_res[CONFIG_T::n_in]; // different, independent, fixed point precision + typename CONFIG_T::table_t exp_diff_res; // different, independent, fixed point precision + data_T data_cache[CONFIG_T::n_in]; + int data_round; + uint index; + #pragma HLS loop pipeline + for (uint ii = 0; ii < CONFIG_T::n_in; ii++) { + data_cache[ii] = data[ii]; + exp_res[ii] = 0; + } + + for (uint ii = 0; ii < CONFIG_T::n_in; ii++) { + for (uint jj = 0; jj < CONFIG_T::n_in; jj++) { + if (ii == jj) + exp_diff_res = 1; + else { + data_round = (data_cache[jj] - data_cache[ii]) * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + exp_diff_res = exp_table[index]; + } + exp_res[ii] += exp_diff_res; + } + } + + // Second loop to invert + for (uint ii = 0; ii < CONFIG_T::n_in; ii++) { + uint exp_res_index = exp_res[ii] * CONFIG_T::table_size / 64; + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + // typename CONFIG_T::table_t exp_res_invert = invert_table[exp_res_index]; + res[ii] = (res_T)invert_table[exp_res_index]; + } +} + +template +void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS loop unroll + for (uint i = 0; i < CONFIG_T::n_in; i++) { + res[i] = (res_T)0; + } + + data_T maximum = data[0]; + int idx = 0; + + #pragma HLS loop pipeline + for (uint i = 1; i < CONFIG_T::n_in; i++) { + if 
(data[i] > maximum) { + maximum = data[i]; + idx = i; + } + } + + res[idx] = (res_T)1; +} + +template +void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS function inline + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case softmax_implementation::legacy: + softmax_legacy(data, res); + break; + case softmax_implementation::argmax: + softmax_argmax(data, res); + break; + } +} + +// ************************************************* +// TanH Activation +// ************************************************* +template void init_tanh_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Implement tanh lookup + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -4 to +4) + float in_val = 2 * 4.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = std::tanh(in_val); + // std::cout << "Tanh: Lookup table Index: " << ii<< " In Value: " << in_val << " Result: " << real_val << + // std::endl; + table_out[ii] = real_val; + } +} + +template void tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_tanh_table(tanh_table); + initialized = true; + } + + // Index into the lookup table based on data + int data_round; + int index; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 8; + index = data_round + 4 * CONFIG_T::table_size / 8; + // std::cout << "Input: " << data[ii] << " Round: " << data_round << " Index: " << index << std::endl; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)tanh_table[index]; + } +} + +// ************************************************* +// UnaryLUT Activation +// ************************************************* +template inline unsigned get_index_unary_lut(data_T x) { + // Slice the top N bits to get an index into the table + static constexpr int N = ceillog2(table_size); + return (unsigned)(x(x.width - 1, 0)); +} + +template +void unary_lut(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], + typename CONFIG_T::table_t table[CONFIG_T::table_size]) { + #pragma HLS memory partition argument(table) + //#pragma HLS function_instantiate variable=table + + #pragma HLS loop unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + unsigned index = get_index_unary_lut(data[ii]); + res[ii] = (res_T)table[index]; + } +} + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* +template +void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS loop pipeline + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + res[ii] = datareg; + } +} + +template +void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + if (CONFIG_T::io_type == io_parallel) { + #pragma HLS loop 
pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + res[ii] = 2 * sigmoid - 1; + } + } else { + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + res[ii] = 2 * sigmoid - 1; + } + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template +void leaky_relu(data_T data[CONFIG_T::n_in], param_T alpha, res_T res[CONFIG_T::n_in]) { + + data_T datareg; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha * datareg; + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* +template +void thresholded_relu(data_T data[CONFIG_T::n_in], param_T theta, res_T res[CONFIG_T::n_in]) { + + data_T datareg; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > theta) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* +inline float softplus_fcn_float(float input) { return std::log(std::exp(input) + 1.); } + +template void init_softplus_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default softplus function: + // result = log(exp(x) + 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = softplus_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softplus_table(softplus_table); + initialized = true; + } + + // Index into the lookup table based on data + int data_round; + int index; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softplus_table[index]; + } +} + +// ************************************************* +// Softsign Activation +// ************************************************* +inline float softsign_fcn_float(float input) { return input / (std::abs(input) + 1.); } + +template void init_softsign_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default softsign function: + // result = x / (abs(x) + 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / 
float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = softsign_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softsign_table(softsign_table); + initialized = true; + } + + // Index into the lookup table based on data + int data_round; + int index; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softsign_table[index]; + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +inline float elu_fcn_float(float input) { return std::exp(input) - 1.; } + +template void init_elu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default ELU function: + // result = alpha * (e^(x) - 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to 0) + float in_val = -8.0 * ii / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = elu_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void elu(data_T data[CONFIG_T::n_in], const param_T alpha, res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_elu_table(elu_table); + initialized = true; + } + + data_T datareg; + // Index into the lookup table based on data + int index; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg >= 0) { + res[ii] = datareg; + } else { + index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = alpha * elu_table[index]; + } + } +} + +template void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + elu, res_T, CONFIG_T>(data, 1.0, res); +} + +// ************************************************* +// SELU Activation +// ************************************************* +inline float selu_fcn_float(float input) { + return 1.0507009873554804934193349852946 * (1.6732632423543772848170429916717 * (std::exp(input) - 1.)); +} + +template void init_selu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default SELU function: + // result = 1.05 * (1.673 * (e^(x) - 1)) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to 0) + float in_val = -8.0 * ii / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = selu_fcn_float(in_val); + // std::cout << 
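// --- Illustrative sketch (editorial, not part of the patch) ---------------
// The sigmoid/softplus/softsign lookup tables above all map an input covering
// [-8, 8) to a table slot with the same arithmetic, idx = x*N/16 + 8*N/16,
// saturated to [0, N-1].  A stand-alone model, assuming a hypothetical table
// size N_LUT and float inputs:
constexpr int N_LUT = 1024; // stands in for CONFIG_T::table_size
inline int lut_index_model(float x) {
    int data_round = static_cast<int>(x * N_LUT / 16); // scale [-8, 8) to [-N/2, N/2)
    int index = data_round + 8 * N_LUT / 16;           // shift to [0, N)
    if (index < 0)
        index = 0;                                     // saturate inputs below -8
    if (index > N_LUT - 1)
        index = N_LUT - 1;                             // saturate inputs at or above +8
    return index;
}
// lut_index_model(-8.0f) == 0, lut_index_model(0.0f) == N_LUT / 2,
// lut_index_model(7.99f) == N_LUT - 1.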
"Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_selu_table(selu_table); + initialized = true; + } + + data_T datareg; + // Index into the lookup table based on data + int index; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg >= 0) { + res[ii] = res_T(1.0507009873554804934193349852946) * datareg; + } else { + index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = selu_table[index]; + } + } +} + +// ************************************************* +// PReLU Activation +// ************************************************* +template +void prelu(data_T data[CONFIG_T::n_in], param_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + + data_T datareg; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha[ii] * datareg; + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template +void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + + data_T datareg; + res_T cache; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + cache = 1; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +// ************************************************* +// Ternary TanH Activation +// ************************************************* +template +void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + + data_T datareg; + res_T cache; + #pragma HLS loop pipeline + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = 2 * data[ii]; + if (datareg > 1) + cache = 1; + else if (datareg > -1 && datareg <= 1) + cache = 0; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/libero/nnet_utils/nnet_activation_stream.h new file mode 100644 index 0000000000..7ee7ffa516 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_activation_stream.h @@ -0,0 +1,800 @@ +#ifndef NNET_ACTIVATION_STREAM_H_ +#define NNET_ACTIVATION_STREAM_H_ + +#include "nnet_activation.h" +#include "nnet_common.h" +#include "nnet_stream.h" +#include "nnet_types.h" +#include +#include +#include +#include + +namespace nnet { + +// ************************************************* +// LINEAR Activation +// ************************************************* +template void linear(hls::FIFO &data, hls::FIFO &res) { +LinearActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + ////PRAGMA_DATA_PACK(out_data) + + LinearPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + out_data[j] = in_data[j]; + } + + res.write(out_data); + } +} + +// ************************************************* +// RELU Activation +// 
************************************************* +template void relu(hls::FIFO &data, hls::FIFO &res) { +ReLUActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + ////PRAGMA_DATA_PACK(out_data) + + ReLUPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res.write(out_data); + } +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* + +template void sigmoid(hls::FIFO &data, hls::FIFO &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_sigmoid_table(sigmoid_table); + initialized = true; + } + +SigmoidActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + ////PRAGMA_DATA_PACK(out_data) + + SigmoidPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + int data_round = in_data[j] * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = sigmoid_table[index]; + } + + res.write(out_data); + } +} + +// ************************************************* +// Softmax Activation +// ************************************************* + +template +void softmax_latency(hls::FIFO &data, hls::FIFO &res) { + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); + constexpr unsigned ii = data_T::size / multiplier_limit; + + // Calculate all the e^x's + #pragma HLS memory partition variable(exp_res) type(complete) + typename CONFIG_T::exp_table_t exp_res[data_T::size]; + typename CONFIG_T::exp_table_t exp_sum(0); +SoftmaxExpLoop: + #pragma HLS loop pipeline II(ii) + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + + data_T in_pack = data.read(); + SoftmaxExpPackLoop: + #pragma HLS loop unroll + for (unsigned j = 0; j < data_T::size; j++) { + unsigned x = softmax_idx_from_real_val(in_pack[j]); + exp_res[j] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
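// --- Illustrative sketch (editorial, not part of the patch) ---------------
// The reduce<...> call that follows sums the exponentials with a balanced
// binary tree (defined later in nnet_common.h) so the adder depth grows as
// log2(N) instead of a serial chain.  A simplified recursive model that
// splits the array in half (the HLS version splits at the largest power of
// two below N):
template <typename T, int N, typename Op> T reduce_model(const T *x, Op op) {
    if constexpr (N == 1) {
        return x[0];
    } else {
        constexpr int left = N / 2;
        return op(reduce_model<T, left>(x, op), reduce_model<T, N - left>(x + left, op));
    }
}
// Usage: float s = reduce_model<float, 8>(vals, [](float a, float b) { return a + b; });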
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + + res_T out_pack; + // PRAGMA_DATA_PACK(out_pack) + + SoftmaxInvPackLoop: + #pragma HLS loop unroll + for (unsigned j = 0; j < res_T::size; j++) { + //#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + out_pack[j] = exp_res[j] * inv_exp_sum; + } + res.write(out_pack); + } +} + +template void softmax_stable(hls::FIFO &data, hls::FIFO &res) { + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + constexpr unsigned multiplier_limit = DIV_ROUNDUP(data_T::size, CONFIG_T::reuse_factor); + constexpr unsigned ii = data_T::size / multiplier_limit; + + #pragma HLS memory partition variable(data_array) type(complete) + typename data_T::value_type data_array[data_T::size]; +SoftmaxArrayLoop: + #pragma HLS loop pipeline II(ii) + for (unsigned i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + + data_T in_pack = data.read(); + SoftmaxArrayPackLoop: + #pragma HLS loop unroll + for (unsigned j = 0; j < data_T::size; j++) { + data_array[j] = in_pack[j]; + } + + // Find the max and compute all delta(x_i, x_max) + Op_max op_max; + typename data_T::value_type x_max = + reduce>(data_array, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + hls::ap_fixpt d_xi_xmax[data_T::size]; + #pragma HLS loop unroll + for (unsigned j = 0; j < data_T::size; j++) { + d_xi_xmax[j] = data_array[j] - x_max; + } + + // Calculate all the e^x's + #pragma HLS memory partition variable(exp_res) type(complete) + typename CONFIG_T::exp_table_t exp_res[data_T::size]; + typename CONFIG_T::exp_table_t exp_sum(0); + #pragma HLS loop unroll + for (unsigned j = 0; j < data_T::size; j++) { + unsigned x = softmax_idx_from_real_val(d_xi_xmax[j]); + exp_res[j] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
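// --- Illustrative sketch (editorial, not part of the patch) ---------------
// multiplier_limit and ii above bound how many multipliers one packed word
// may use per cycle: DIV_ROUNDUP(n, d) is ceil(n/d), and the pack is then
// spread over ii cycles.  A worked example with illustrative numbers
// (pack_size and reuse below are assumptions for this sketch only):
constexpr unsigned div_roundup_model(unsigned n, unsigned d) { return (n + d - 1) / d; }
constexpr unsigned pack_size = 10;                                   // data_T::size
constexpr unsigned reuse = 4;                                        // CONFIG_T::reuse_factor
constexpr unsigned mult_limit = div_roundup_model(pack_size, reuse); // ceil(10/4) = 3 multipliers
constexpr unsigned ii_model = pack_size / mult_limit;                // 10/3 = 3 cycles per pack
static_assert(mult_limit == 3 && ii_model == 3, "worked example");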
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + + res_T out_pack; + // PRAGMA_DATA_PACK(out_pack) + + SoftmaxInvPackLoop: + #pragma HLS loop unroll + for (unsigned j = 0; j < res_T::size; j++) { + //#pragma HLS ALLOCATION operation instances=mul limit=multiplier_limit + out_pack[j] = exp_res[j] * inv_exp_sum; + } + + res.write(out_pack); + } +} + +template void softmax_legacy(hls::FIFO &data, hls::FIFO &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_exp_table_legacy(exp_table); + init_invert_table_legacy(invert_table); + initialized = true; + } + + // Index into the lookup table based on data for exponentials + typename CONFIG_T::table_t exp_res[data_T::size]; + typename CONFIG_T::table_t exp_diff_res; + typename data_T::value_type data_cache[data_T::size]; + +SoftmaxInitLoop: + #pragma HLS loop pipeline + for (unsigned s = 0; s < CONFIG_T::n_in / data_T::size; s++) { + data_T in_pack = data.read(); + SoftmaxInitPackLoop: + #pragma HLS loop unroll + for (unsigned j = 0; j < data_T::size; j++) { + data_cache[j] = in_pack[j]; + exp_res[j] = 0; + } + + SoftmaxExpLoop: + #pragma HLS loop unroll + for (int i = 0; i < data_T::size; i++) { + SoftmaxExpInner: + #pragma HLS loop unroll + for (int j = 0; j < data_T::size; j++) { + if (i == j) { + exp_diff_res = 1; + } else { + int data_round = (data_cache[j] - data_cache[i]) * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + exp_diff_res = exp_table[index]; + } + + exp_res[i] += exp_diff_res; + } + } + + res_T out_pack; + // PRAGMA_DATA_PACK(out_pack) + + SoftmaxInvPackLoop: + #pragma HLS loop unroll + for (unsigned j = 0; j < res_T::size; j++) { + + int exp_res_index = exp_res[j] * CONFIG_T::table_size / 64; + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + + out_pack[j] = (typename res_T::value_type)invert_table[exp_res_index]; + } + res.write(out_pack); + } +} + +template void softmax_argmax(hls::FIFO &data, hls::FIFO &res) { + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + data_T in_data = data.read(); + res_T out_data; + + #pragma HLS loop unroll + for (int i = 0; i < res_T::size; i++) { + out_data[i] = (typename res_T::value_type)0; + } + + typename data_T::value_type maximum = in_data[0]; + int idx = 0; + + #pragma HLS loop pipeline + for (int i = 1; i < res_T::size; i++) { + if (in_data[i] > maximum) { + maximum = in_data[i]; + idx = i; + } + } + + out_data[idx] = (typename res_T::value_type)1; + res.write(out_data); + } +} + +template void softmax(hls::FIFO &data, hls::FIFO &res) { + assert(CONFIG_T::axis == -1); + + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::stable: + 
softmax_stable(data, res); + break; + case softmax_implementation::legacy: + softmax_legacy(data, res); + break; + case softmax_implementation::argmax: + softmax_argmax(data, res); + break; + } +} + +// ************************************************* +// TanH Activation +// ************************************************* + +template void tanh(hls::FIFO &data, hls::FIFO &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_tanh_table(tanh_table); + initialized = true; + } + +TanHActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + ////PRAGMA_DATA_PACK(out_data) + + TanHPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + int data_round = in_data[j] * CONFIG_T::table_size / 8; + int index = data_round + 4 * CONFIG_T::table_size / 8; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = tanh_table[index]; + } + + res.write(out_data); + } +} + +// ************************************************* +// UnaryLUT Activation +// ************************************************* + +template +void unary_lut(hls::FIFO &data, hls::FIFO &res, typename CONFIG_T::table_t table[CONFIG_T::table_size]) { + //#pragma HLS function_instantiate variable=table + #pragma HLS memory partition argument(table) type(complete) + +UnaryLUTActLoop: + #pragma HLS loop pipeline II(CONFIG_T::reuse_factor)// rewind + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + UnaryLUTPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + unsigned index = get_index_unary_lut(in_data[j].V); + out_data[j] = table[index]; + } + + res.write(out_data); + } +} + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* + +template void hard_sigmoid(hls::FIFO &data, hls::FIFO &res) { + +HardSigmoidActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + HardSigmoidPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + auto datareg = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + out_data[j] = datareg; + } + + res.write(out_data); + } +} + +template void hard_tanh(hls::FIFO &data, hls::FIFO &res) { + +HardSigmoidActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + HardSigmoidPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + auto sigmoid = CONFIG_T::slope * in_data[j] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + out_data[j] = 2 * sigmoid - 1; + } + + res.write(out_data); + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* + +template +void leaky_relu(hls::FIFO &data, param_T alpha, hls::FIFO 
&res) { +LeakyReLUActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + LeakyReLUPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha * in_data[j]; + } + res.write(out_data); + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* + +template +void thresholded_relu(hls::FIFO &data, param_T theta, hls::FIFO &res) { +ThresholdedReLUActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + ThresholdedReLUPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + if (in_data[j] > theta) + out_data[j] = in_data[j]; + else + out_data[j] = 0; + } + + res.write(out_data); + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* + +template void softplus(hls::FIFO &data, hls::FIFO &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softplus_table(softplus_table); + initialized = true; + } + +SoftplusActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + SoftplusPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + int data_round = in_data[j] * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = softplus_table[index]; + } + res.write(out_data); + } +} + +// ************************************************* +// Softsign Activation +// ************************************************* + +template void softsign(hls::FIFO &data, hls::FIFO &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softsign_table(softsign_table); + initialized = true; + } + +SoftsignActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + SoftsignPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + int data_round = in_data[j] * CONFIG_T::table_size / 16; + int index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + else if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = softsign_table[index]; + } + res.write(out_data); + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +template +void elu(hls::FIFO &data, param_T alpha, hls::FIFO &res) { + // Initialize the lookup 
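// --- Illustrative sketch (editorial, not part of the patch) ---------------
// The ELU/SELU tables cover only the negative half-axis [-8, 0): a negative
// input x maps to idx = x * N / -8, saturated at N-1, while non-negative
// inputs bypass the table.  Stand-alone model with a hypothetical table size:
constexpr int N_ELU_LUT = 1024;                       // stands in for CONFIG_T::table_size
inline float elu_model(float x, float alpha, const float table[N_ELU_LUT]) {
    if (x >= 0)
        return x;                                     // identity on the non-negative side
    int index = static_cast<int>(x * N_ELU_LUT / -8); // map [-8, 0) to [0, N)
    if (index > N_ELU_LUT - 1)
        index = N_ELU_LUT - 1;                        // saturate for x <= -8
    return alpha * table[index];                      // table entries hold exp(x) - 1
}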
table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_elu_table(elu_table); + initialized = true; + } + +EluActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + EluPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + + typename data_T::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = datareg; + } else { + int index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = alpha * elu_table[index]; + } + } + res.write(out_data); + } +} + +template void elu(hls::FIFO &data, hls::FIFO &res) { + elu, res_T, CONFIG_T>(data, 1.0, res); +} + +// ************************************************* +// SELU Activation +// ************************************************* + +template void selu(hls::FIFO &data, hls::FIFO &res) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_selu_table(selu_table); + initialized = true; + } + +SeluActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + SeluPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + + typename data_T::value_type datareg = in_data[j]; + if (datareg >= 0) { + out_data[j] = (typename data_T::value_type)1.0507009873554804934193349852946 * datareg; + } else { + int index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + out_data[j] = selu_table[index]; + } + } + res.write(out_data); + } +} + +// ************************************************* +// PReLU Activation +// ************************************************* + +template +void prelu(hls::FIFO &data, const param_T alpha[CONFIG_T::n_in], hls::FIFO &res) { +PReLUActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + PReLUPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + if (in_data[j] > 0) + out_data[j] = in_data[j]; + else + out_data[j] = alpha[i * res_T::size + j] * in_data[j]; + } + res.write(out_data); + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template void binary_tanh(hls::FIFO &data, hls::FIFO &res) { +PReLUActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + PReLUPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + if (in_data[j] > 0) + out_data[j] = (typename res_T::value_type)1; + else + out_data[j] = (typename res_T::value_type) - 1; + } + res.write(out_data); + } +} + +// ************************************************* +// Ternary TanH Activation +// 
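// --- Illustrative sketch (editorial, not part of the patch) ---------------
// binary_tanh above and ternary_tanh below are quantizers rather than a true
// tanh: binary keeps only the sign, ternary adds a dead zone around zero
// (thresholds at |x| = 0.5 in the io_parallel version earlier in this patch).
inline int binary_tanh_model(float x) { return x > 0 ? 1 : -1; }
inline int ternary_tanh_model(float x) {
    float d = 2 * x;
    if (d > 1)
        return 1;                                     // x > 0.5
    if (d <= -1)
        return -1;                                    // x <= -0.5
    return 0;                                         // dead zone in between
}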
************************************************* +template void ternary_tanh(hls::FIFO &data, hls::FIFO &res) { +PReLUActLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::n_in / res_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + PReLUPackLoop: + #pragma HLS loop unroll + for (int j = 0; j < res_T::size; j++) { + if (in_data[j] > 1) + out_data[j] = (typename res_T::value_type)1; + else if (in_data[j] <= -1) + out_data[j] = (typename res_T::value_type) - 1; + else + out_data[j] = (typename res_T::value_type)0; + } + res.write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_code_gen.h b/hls4ml/templates/libero/nnet_utils/nnet_code_gen.h new file mode 100644 index 0000000000..30953da7bf --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_code_gen.h @@ -0,0 +1,28 @@ +#ifndef NNET_INSTR_GEN_H_ +#define NNET_INSTR_GEN_H_ + +#include "nnet_conv1d_latency.h" +#include "nnet_helpers.h" + +#include "nnet_common.h" +#include "nnet_function_stubs.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template class PointwiseConv1D { + public: + static void pointwise_conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // To be implemented in subclasses + } +}; + +// hls4ml insert code + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_common.h b/hls4ml/templates/libero/nnet_utils/nnet_common.h new file mode 100644 index 0000000000..64e3482ea7 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_common.h @@ -0,0 +1,65 @@ +#ifndef NNET_COMMON_H_ +#define NNET_COMMON_H_ + +#include "nnet_helpers.h" +#include + +// This is a substitute for "ceil(n/(float)d)". +#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) +#define MIN(n, d) (n > d ? d : n) +#define MAX(n, d) (n > d ? n : d) + +namespace nnet { + +// Common type definitions +enum io_type { io_parallel = 0, io_stream }; +enum strategy { latency, resource }; + +/* --- + * Balanced tree reduce implementation. + * For use in scenarios where Vivado cannot expression balance + * Reduces an array of inputs to a single value using the template binary operator 'Op', + * for example summing all elements with Op_add, or finding the maximum with Op_max + * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section + * before applying and accumulate the result over the rolled dimension. + * --- */ +template T reduce(const T *x, Op op) { + static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0; + static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; + if (N == 1) { + return x[0]; + } + if (N == 2) { + return op(x[0], x[1]); + } + return op(reduce(x, op), reduce(x + leftN, op)); +} + +template class Op_add { + public: + T operator()(T a, T b) { return a + b; } +}; + +template class Op_and { + public: + T operator()(T a, T b) { return a && b; } +}; + +template class Op_or { + public: + T operator()(T a, T b) { return a || b; } +}; + +template class Op_max { + public: + T operator()(T a, T b) { return a >= b ? a : b; } +}; + +template class Op_min { + public: + T operator()(T a, T b) { return a <= b ? 
a : b; } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/libero/nnet_utils/nnet_conv1d_latency.h new file mode 100644 index 0000000000..6caa6166f3 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_conv1d_latency.h @@ -0,0 +1,167 @@ +#ifndef NNET_CONV1D_LATENCY_H_ +#define NNET_CONV1D_LATENCY_H_ + +#include "nnet_common.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template +void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + constexpr unsigned mult_n_in = CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + //#pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + typename CONFIG_T::accum_t mult[mult_n_in * mult_n_out]; + //#pragma HLS ARRAY_PARTITION variable=mult complete + + typename CONFIG_T::accum_t acc[mult_n_out]; + //#pragma HLS ARRAY_PARTITION variable=acc complete + + //#pragma HLS ARRAY_PARTITION variable=weights complete + //#pragma HLS ARRAY_PARTITION variable=biases complete + + // Limit multipliers to control parallelization + //#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit + +PartitionLoop: + #pragma HLS loop pipeline II(CONFIG_T::reuse_factor)// rewind + for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelLoop: + #pragma HLS loop unroll + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + + data_T cache; + + // Do the matrix-multiply + Product1: + #pragma HLS loop unroll + for (int i_in = 0; i_in < mult_n_in; i_in++) { + cache = data_buf[i_pxl][i_in]; + Product2: + #pragma HLS loop unroll + for (int i_out = 0; i_out < mult_n_out; i_out++) { + mult[i_in * mult_n_out + i_out] = + CONFIG_T::mult_config::template product::product( + cache, weights[i_in * mult_n_out + i_out]); + } + } + + // Initialize accumulator with input biases + ResetAccum: + #pragma HLS loop unroll + for (int i_acc = 0; i_acc < mult_n_out; i_acc++) { + acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc]; + } + + // Accumulate multiplication result + Accum1: + #pragma HLS loop unroll + for (int i_in = 0; i_in < mult_n_in; i_in++) { + Accum2: + #pragma HLS loop unroll + for (int i_out = 0; i_out < mult_n_out; i_out++) { + acc[i_out] += mult[i_in * mult_n_out + i_out]; + } + } + + // Cast to "res_t" type + Result: + #pragma HLS loop unroll + for (int i_res = 0; i_res < mult_n_out; i_res++) { + res[i_part * CONFIG_T::n_pixels * mult_n_out + i_pxl * mult_n_out + i_res] = + cast(acc[i_res]); + } + } + } +} + +template +void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor]; + typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt]; + + //#pragma HLS ARRAY_PARTITION 
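// --- Illustrative sketch (editorial, not part of the patch) ---------------
// The pointwise (1x1) convolution here reads input position ii*stride -
// pad_left for output position ii; positions that fall in the left/right
// padding contribute zero instead of a product.  A scalar model of one tap
// (the sizes below are illustrative assumptions):
constexpr int PW_STRIDE = 1, PW_PAD_LEFT = 0, PW_IN_WIDTH = 8, PW_N_CHAN = 3;
inline float pointwise_tap_model(const float *data, const float *weights, int ii, int cc, int ff, int n_filt) {
    int in_pos = ii * PW_STRIDE - PW_PAD_LEFT;
    if (in_pos < 0 || in_pos >= PW_IN_WIDTH)
        return 0.0f;                                                      // padding region
    return data[in_pos * PW_N_CHAN + cc] * weights[cc * n_filt + ff];     // one multiply of the 1x1 kernel
}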
variable=mult complete dim=0 + //#pragma HLS ARRAY_PARTITION variable=acc complete dim=0 + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + //#pragma HLS function_instantiate variable=weights,biases + + // Parallel mode + //#pragma HLS ARRAY_PARTITION variable=weights complete dim=0 + //#pragma HLS ARRAY_PARTITION variable=biases complete dim=0 + + // Limit multipliers to control parallelization + //#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit + +// Convolve, saving all multiplication results to accumulate later +ConvOut: + #pragma HLS loop pipeline II(CONFIG_T::reuse_factor) + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + ConvFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + ConvChan: + #pragma HLS loop unroll + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + int index_weight = cc * CONFIG_T::n_filt + ff; + int index_data = (ii * CONFIG_T::stride_width - CONFIG_T::pad_left) * CONFIG_T::n_chan + cc; + + if ((ii * CONFIG_T::stride_width) < CONFIG_T::pad_left || + (ii * CONFIG_T::stride_width) >= (CONFIG_T::pad_left + CONFIG_T::in_width)) { + mult[index_mult] = 0; + } else { + mult[index_mult] = CONFIG_T::mult_config::template product::product( + data[index_data], weights[index_weight]); + } + } // end channel loop + } // end filter loop + } // end output loop + + // Initialize accumulator with input biases + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + #pragma HLS loop unroll + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + acc[ii][ff] = biases[ff]; + } + } + +// Accumulate multiplication result +AccumOut: + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + AccumFilt: + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Do "dot product" sum within filter and sum over channels + AccumChan: + for (int cc = 0; cc < CONFIG_T::n_chan; cc++) { + int index_mult = ii * CONFIG_T::n_filt * CONFIG_T::n_chan + ff * CONFIG_T::n_chan + cc; + acc[ii][ff] += mult[index_mult]; + } // end channel loop + } // end filter loop + } // end output loop + + // Cast to "res_t" type + for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) { + #pragma HLS loop unroll + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + res[ii * CONFIG_T::n_filt + ff] = cast(acc[ii][ff]); + } + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_dense.h b/hls4ml/templates/libero/nnet_utils/nnet_dense.h new file mode 100644 index 0000000000..0548dab794 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_dense.h @@ -0,0 +1,82 @@ +#ifndef NNET_DENSE_H_ +#define NNET_DENSE_H_ + +#include "nnet_common.h" +#include "nnet_dense_latency.h" +#include "nnet_dense_resource.h" +#include "nnet_function_stubs.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include +#include + +namespace nnet { + +struct dense_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_out = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned strategy = latency; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + + template using kernel = 
nnet::DenseKernel; + + // Partitioning arrays cyclically to go with roll factors? + + // Product function to use + template using product = nnet::product::mult; +}; + +template +void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS function inline + CONFIG_T::template kernel::dense(data, res, weights, biases); +} + +template class DenseLatency : public DenseKernel { + public: + static void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS function inline + dense_latency(data, res, weights, biases); + } +}; + +template +class DenseResource_rf_leq_nin : public DenseKernel { + public: + static void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS function inline + dense_resource_rf_leq_nin(data, res, weights, biases); + } +}; + +template +class DenseResource_rf_gt_nin_rem0 : public DenseKernel { + public: + static void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS function inline + dense_resource_rf_gt_nin_rem0(data, res, weights, biases); + } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_dense_compressed.h b/hls4ml/templates/libero/nnet_utils/nnet_dense_compressed.h new file mode 100644 index 0000000000..de8743d712 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_dense_compressed.h @@ -0,0 +1,89 @@ +#ifndef NNET_COMPRESSED_LAYER_H_ +#define NNET_COMPRESSED_LAYER_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" +#include +#include + +namespace nnet { + +template +void fill_mult(typename CONFIG_T::index_t index, typename CONFIG_T::accum_t mult[CONFIG_T::n_out], + typename CONFIG_T::accum_t weight) { + #pragma HLS loop unroll + for (unsigned k = 0; k < CONFIG_T::n_out; k++) { + if (k == index) + mult[k] += weight; + } +} + +template +void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + #pragma HLS memory partition argument(biases) type(complete) + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_nonzeros, CONFIG_T::reuse_factor); + + #pragma HLS memory partition variable(acc) type(complete) + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + //#pragma HLS ARRAY_RESHAPE variable=weights block factor=multiplier_limit + + //#ifdef __VITIS_HLS__ + // #pragma HLS AGGREGATE variable=weights + //#else + // #pragma HLS data_pack variable=weights struct_level + //#endif + +InitAccum: + #pragma HLS loop unroll + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + acc[i] = (typename CONFIG_T::accum_t)(biases[i]); + } + + // Do the compressed matrix-multiply + const int rufactor = CONFIG_T::reuse_factor; +ReuseLoop: + for (unsigned ir = 0; ir < rufactor; ir++) { + + #pragma HLS memory partition variable(mult) type(complete) + typename CONFIG_T::accum_t mult[CONFIG_T::n_out]; + + ResetMult: + #pragma HLS loop unroll + for (int imult = 0; imult < CONFIG_T::n_out; imult++) { + 
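// --- Illustrative sketch (editorial, not part of the patch) ---------------
// dense() above dispatches through the `kernel` template alias in the layer
// config, so a generated config can point a layer at DenseLatency or one of
// the DenseResource_* kernels at compile time.  A minimal stand-alone
// analogue of that dispatch (names here are hypothetical, not the hls4ml API):
template <class data_T, class res_T> struct LatencyKernelModel {
    static void run(const data_T *in, res_T *out) { out[0] = res_T(in[0]); } // placeholder body
};
struct layer_config_model {
    template <class d, class r> using kernel = LatencyKernelModel<d, r>;     // chosen when the config is generated
};
template <class d, class r, class CFG> void layer_model(const d *in, r *out) {
    CFG::template kernel<d, r>::run(in, out);                                // compile-time dispatch
}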
mult[imult] = 0; + } + + CompressedMultLoop: + #pragma HLS loop unroll + for (unsigned im = 0; im < multiplier_limit; im++) { + unsigned w = im * rufactor + ir; + auto row = weights[w].row_index; + auto col = weights[w].col_index; + auto weight_cache = weights[w].weight; + data_T data_cache = data[row]; + // mult[col] += weight_cache * data_cache; + typename CONFIG_T::accum_t prod = + CONFIG_T::template product::product(data_cache, weight_cache); + fill_mult(col, mult, prod); + } + + for (int im = 0; im < CONFIG_T::n_out; im++) { + acc[im] += mult[im]; + } + } + +// Cast to "res_t" type +ResultLoop: + #pragma HLS loop unroll + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + // res[i] = (res_T) (acc[i]); + res[i] = cast(acc[i]); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_dense_latency.h b/hls4ml/templates/libero/nnet_utils/nnet_dense_latency.h new file mode 100644 index 0000000000..a241857450 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_dense_latency.h @@ -0,0 +1,72 @@ +#ifndef NNET_DENSE_LATENCY_H_ +#define NNET_DENSE_LATENCY_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include +#include + +namespace nnet { + +template +void dense_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS function pipeline II(CONFIG_T::reuse_factor) + #pragma HLS memory partition argument(biases) type(complete) + data_T cache; + #pragma HLS memory partition variable(mult) type(complete) + typename CONFIG_T::accum_t mult[CONFIG_T::n_in * CONFIG_T::n_out]; + #pragma HLS memory partition variable(acc) type(complete) + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + //#pragma HLS function_instantiate variable=weights,biases + + // For parallel inputs: + // - completely partition arrays -- target fabric + // - if we have an unroll factor, limit number of multipliers + + // #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes + + //#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + +// Do the matrix-multiply +Product1: + for (uint ii = 0; ii < CONFIG_T::n_in; ii++) { + cache = data[ii]; + Product2: + for (uint jj = 0; jj < CONFIG_T::n_out; jj++) { + int index = ii * CONFIG_T::n_out + jj; + mult[index] = CONFIG_T::template product::product(cache, weights[index]); + } + } + +// Initialize accumulator with input biases +ResetAccum: + for (uint iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +// Accumulate multiplication result +Accum1: + for (uint ii = 0; ii < CONFIG_T::n_in; ii++) { + Accum2: + for (uint jj = 0; jj < CONFIG_T::n_out; jj++) { + int index = ii * CONFIG_T::n_out + jj; + acc[jj] += mult[index]; + } + } + +// Cast to "res_t" type +Result: + for (uint ires = 0; ires < CONFIG_T::n_out; ires++) { + // res[ires] = (res_T) (acc[ires]); + res[ires] = cast(acc[ires]); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_dense_resource.h b/hls4ml/templates/libero/nnet_utils/nnet_dense_resource.h new file mode 100644 index 0000000000..2c21e2f845 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_dense_resource.h @@ -0,0 +1,270 @@ +#ifndef NNET_DENSE_RESOURCE_H_ 
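// --- Illustrative sketch (editorial, not part of the patch) ---------------
// dense_latency above is a fully unrolled matrix-vector product: every
// product gets its own mult[] slot, biases seed the accumulators, and the
// loops collapse under the unroll/pipeline pragmas.  A plain C++ reference
// model of the same arithmetic:
template <int N_IN, int N_OUT>
void dense_latency_model(const float data[N_IN], float res[N_OUT],
                         const float weights[N_IN * N_OUT], const float biases[N_OUT]) {
    float acc[N_OUT];
    for (int j = 0; j < N_OUT; j++)
        acc[j] = biases[j];                             // seed accumulators with the biases
    for (int i = 0; i < N_IN; i++)
        for (int j = 0; j < N_OUT; j++)
            acc[j] += data[i] * weights[i * N_OUT + j]; // row-major weight layout, as above
    for (int j = 0; j < N_OUT; j++)
        res[j] = acc[j];                                // the HLS version casts to res_T here
}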
+#define NNET_DENSE_RESOURCE_H_ + +#include "nnet_common.h" +#include "nnet_mult.h" +#include +#include +#include + +namespace nnet { + +template +void dense_resource_rf_leq_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + #pragma HLS memory partition argument(biases) type(complete) + + const int rufactor = CONFIG_T::reuse_factor; + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); + + //#pragma HLS function_instantiate variable=weights,biases + //#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + + // if (CONFIG_T::reuse_factor > 1) { + // #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM + // } + + #pragma HLS memory partition variable(acc) type(complete) + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + +InitAccum: + #pragma HLS loop unroll + for (int iacc = 0; iacc < nout; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +ReuseLoop: + #pragma HLS loop pipeline II(1)// rewind + for (int ir = 0; ir < rufactor; ir++) { + + int w_index = ir; + int in_index = ir; + int out_index = 0; + int acc_step = 0; + + MultLoop: + #pragma HLS loop unroll + for (int im = 0; im < block_factor; im++) { + + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights[w_index])); + + // Increment w_index + w_index += rufactor; + // Increment in_index + in_index += rufactor; + if (in_index >= nin) { + in_index = ir; + } + // Increment out_index + if (acc_step + 1 >= multscale) { + acc_step = 0; + out_index++; + } else { + acc_step++; + } + } + } + +// Cast to "res_t" type +Result: + #pragma HLS loop unroll + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin_rem0(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + #pragma HLS memory partition argument(biases) type(complete) + const int rufactor = MIN(CONFIG_T::reuse_factor, CONFIG_T::n_in * CONFIG_T::n_out); + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((rufactor > nin && rufactor % nin == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0"); + + //#pragma HLS function_instantiate variable=weights,biases + //#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + + // if (CONFIG_T::reuse_factor > 1) { + // 
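// --- Illustrative sketch (editorial, not part of the patch) ---------------
// The asserts in these resource kernels encode which reuse factors are legal.
// For the RF <= n_in kernel above, the binding constraint is that
// ceil(n_in*n_out / RF) splits evenly over the outputs; a simplified
// compile-time check (a model, not the hls4ml API):
constexpr bool rf_valid_leq_nin_model(int n_in, int n_out, int rf) {
    return rf <= n_in && ((n_in * n_out + rf - 1) / rf) % n_out == 0;
}
static_assert(rf_valid_leq_nin_model(16, 8, 4), "RF=4 works for a 16x8 dense layer");
static_assert(!rf_valid_leq_nin_model(16, 8, 5), "RF=5 does not: 26 multipliers cannot split over 8 outputs");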
#pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM + // } + + #pragma HLS memory partition variable(acc) type(complete) + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + +InitAccum: + #pragma HLS loop unroll + for (int iacc = 0; iacc < nout; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + + int w_index; + int in_index = 0; + int out_index; + int outstep = 0; + const int outscale = rufactor / nin; + + int outidx[rufactor]; +IndexLoop: + for (int ir = 0; ir < rufactor; ir++) { + outidx[ir] = outstep; + if ((ir + 1) % nin == 0) { + outstep++; + } + } + +ReuseLoop: + #pragma HLS loop pipeline II(1)// rewind + for (int ir = 0; ir < rufactor; ir++) { + + w_index = ir; + out_index = outidx[ir] /*outstep*/; + + MultLoop: + #pragma HLS loop unroll + for (int im = 0; im < block_factor; im++) { + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights[w_index])); + + w_index += rufactor; + if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) + break; // check out of bounds + out_index += outscale; + } + + in_index++; + if (in_index >= nin) { + in_index = 0; + // outstep++; // This causes a huge increase in scheduling and RTL generation times, hence the above workaround. + } + } + +// Cast to "res_t" type +Result: + #pragma HLS loop unroll + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS memory partition argument(biases) type(complete) + const int rufactor = CONFIG_T::reuse_factor; + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((rufactor > nin) && "This function is correct only for RF > N_IN"); + + //#pragma HLS function_instantiate variable=weights,biases + //#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + + // if (CONFIG_T::reuse_factor > 1) { + // #pragma HLS RESOURCE variable=weights core=ROM_nP_BRAM + // } + + #pragma HLS memory partition variable(acc) type(complete) + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + +InitAccum: + #pragma HLS loop unroll + for (int iacc = 0; iacc < nout; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +ReuseLoop: + #pragma HLS loop pipeline II(1)// rewind + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS memory partition variable(tmpmult) type(complete) + typename CONFIG_T::accum_t tmpmult[block_factor]; + + MultLoop: + #pragma HLS loop unroll + for (int im = 0; im < block_factor; im++) { + int w_index = ir + rufactor * im; + int in_index = w_index % nin; + if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) + continue; // check out of bounds + tmpmult[im] = + CONFIG_T::template product::product(data[in_index], weights[w_index]); + } + + #pragma HLS memory partition variable(mult) type(complete) + typename CONFIG_T::accum_t mult[multiplier_limit]; + + ResetMult: + #pragma HLS loop unroll + for (int imult = 0; imult < 
multiplier_limit; imult++) { + mult[imult] = 0; + } + + AccumLoop1: + #pragma HLS loop unroll + for (int im = 0; im < block_factor; im++) { + int w_index = ir + rufactor * im; + int out_index = w_index / multfactor; + if (out_index >= multiplier_limit) + continue; // check out of bounds + mult[out_index] += tmpmult[im]; + } + + AccumLoop2: + #pragma HLS loop unroll + for (int im = 0; im < multiplier_limit; im++) { + // int out_index = im/multscale; // This is the general case + // acc[out_index] += mult[im]; + acc[im] += mult[im]; // If RF > N_IN then multiplier_limit == n_out + } + } + +// Cast to "res_t" type +Result: + #pragma HLS loop unroll + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { + dense_resource_rf_leq_nin(data, res, weights, biases); + } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) { + dense_resource_rf_gt_nin_rem0(data, res, weights, biases); + } else { + dense_resource_rf_gt_nin(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_dense_stream.h b/hls4ml/templates/libero/nnet_utils/nnet_dense_stream.h new file mode 100644 index 0000000000..194eb018ae --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_dense_stream.h @@ -0,0 +1,105 @@ +#ifndef NNET_DENSE_STREAM_H_ +#define NNET_DENSE_STREAM_H_ + +#include "nnet_common.h" +#include "nnet_types.h" +#include +#include +#include + +namespace nnet { + +template +void dense_latency_wrapper(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS function pipeline II(CONFIG_T::reuse_factor) + dense_latency(data, res, weights, biases); +} + +template +void dense_resource_wrapper(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + dense_resource(data, res, weights, biases); +} + +template +void data_prepare(hls::FIFO &data_stream, typename data_T::value_type data[CONFIG_T::n_in]) { + #pragma HLS function inline + + if (CONFIG_T::n_in / data_T::size > 1) { + DataPrepare: + #pragma HLS loop pipeline + for (int i_in = 0; i_in < CONFIG_T::n_in / data_T::size; i_in++) { + data_T data_pack = data_stream.read(); + DataPackPipeline: + #pragma HLS loop unroll + for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + data[i_in * data_T::size + i_pack] = data_pack[i_pack]; + } + } + } else { + data_T data_pack = data_stream.read(); + DataPackSingle: + #pragma HLS loop unroll + for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + data[i_pack] = data_pack[i_pack]; + } + } +} + +template +void res_write(typename res_T::value_type res[CONFIG_T::n_out], hls::FIFO &res_stream) { + #pragma HLS function inline + + if (CONFIG_T::n_out / res_T::size > 1) { + ResWrite: + #pragma HLS loop pipeline + for (unsigned i_out = 0; i_out < CONFIG_T::n_out / res_T::size; i_out++) { + res_T res_pack; + // PRAGMA_DATA_PACK(res_pack) + ResPackPipeline: + #pragma HLS loop unroll + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + res_pack[i_pack] = res[i_out * res_T::size + i_pack]; + } + 
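// --- Illustrative sketch (editorial, not part of the patch) ---------------
// dense_resource above picks one of the three kernels purely from the reuse
// factor; the rule can be written as a small compile-time selector (a model,
// not part of the patch):
enum class rf_kernel_model { rf_leq_nin, rf_gt_nin_rem0, rf_gt_nin };
constexpr rf_kernel_model pick_kernel_model(int reuse_factor, int n_in) {
    return reuse_factor <= n_in       ? rf_kernel_model::rf_leq_nin
           : reuse_factor % n_in == 0 ? rf_kernel_model::rf_gt_nin_rem0
                                      : rf_kernel_model::rf_gt_nin;
}
static_assert(pick_kernel_model(4, 16) == rf_kernel_model::rf_leq_nin, "RF below n_in");
static_assert(pick_kernel_model(32, 16) == rf_kernel_model::rf_gt_nin_rem0, "RF a multiple of n_in");
static_assert(pick_kernel_model(24, 16) == rf_kernel_model::rf_gt_nin, "general fallback");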
res_stream.write(res_pack); + } + } else { + res_T res_pack; + // PRAGMA_DATA_PACK(res_pack) + ResPackSingle: + #pragma HLS loop unroll + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + res_pack[i_pack] = res[i_pack]; + } + res_stream.write(res_pack); + } +} + +template +void dense(hls::FIFO &data_stream, hls::FIFO &res_stream, + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS function inline + + #pragma HLS memory partition variable(data) type(complete) + typename data_T::value_type data[CONFIG_T::n_in]; + + #pragma HLS memory partition variable(res) type(complete) + typename res_T::value_type res[CONFIG_T::n_out]; + + data_prepare(data_stream, data); + if (CONFIG_T::strategy == nnet::latency) { + dense_latency_wrapper(data, res, weights, biases); + } else { + dense_resource_wrapper(data, res, weights, + biases); + } + res_write(res, res_stream); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_function_stubs.h b/hls4ml/templates/libero/nnet_utils/nnet_function_stubs.h new file mode 100644 index 0000000000..c42b28a463 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_function_stubs.h @@ -0,0 +1,51 @@ +#ifndef NNET_FUNCTION_STUBS_H_ +#define NNET_FUNCTION_STUBS_H_ + +#include "nnet_helpers.h" + +#include "nnet_common.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template class FillConv1DBuffer { + public: + static void fill_buffer(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { + // To be implemented in subclasses + } +}; + +template class FillConv2DBuffer { + public: + static void + fill_buffer(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + data_T buffer[CONFIG_T::n_pixels][CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan], + const unsigned partition) { + // To be implemented in subclasses + } +}; + +template class DenseKernel { + public: + static void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + // To be implemented in subclasses + } +}; + +template class Conv1DKernel { + public: + static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + // To be implemented in subclasses + } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_helpers.h b/hls4ml/templates/libero/nnet_utils/nnet_helpers.h new file mode 100644 index 0000000000..a37b1243c7 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_helpers.h @@ -0,0 +1,279 @@ +#ifndef NNET_HELPERS_H +#define NNET_HELPERS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nnet { + +#ifndef __SYNTHESIS__ + +#ifndef WEIGHTS_DIR +#define WEIGHTS_DIR "firmware/weights" +#endif + +template void load_weights_from_txt(T *w, const char *fname) { + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + 
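// --- Illustrative sketch (editorial, not part of the patch) ---------------
// The load_weights_from_txt helper expects each weights file to be a single
// line of comma-separated values.  A stand-alone equivalent using
// std::vector (the file path in the usage note is a hypothetical example):
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
inline std::vector<float> load_csv_line_model(const std::string &path) {
    std::ifstream in(path);
    std::vector<float> values;
    std::string line, token;
    if (std::getline(in, line)) {
        std::istringstream iss(line);
        while (std::getline(iss, token, ','))
            values.push_back(std::stof(token));     // one weight per comma-separated token
    }
    return values;                                  // caller checks values.size() against SIZE
}
// Usage: auto w = load_csv_line_model("firmware/weights/w2.txt");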
exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + + size_t i = 0; + while (std::getline(iss, token, ',')) { + w[i] = T(std::stof(token.c_str())); + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} + +template void load_exponent_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + std::string extra_chars = "} "; + + size_t i = 0; + while (std::getline(iss, token, '{')) { + if (token.length() == 0) { + continue; + } + for (char c : extra_chars) { + token.erase(std::remove(token.begin(), token.end(), c), token.end()); + } + if (token.back() == ',') { + token.erase(token.end() - 1); + } + + std::replace(token.begin(), token.end(), ',', ' '); + std::istringstream structss(token); + + if (!(structss >> w[i].sign >> w[i].weight)) { + std::cerr << "ERROR: Unable to parse file " << std::string(fname); + exit(1); + } + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} +template void convert_data(srcType *src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + dst[i] = dstType(src[i]); + } +} + +template void convert_data(srcType *src, hls::FIFO &dst) { + for (size_t i = 0; i < SIZE / dstType::size; i++) { + dstType ctype; + for (size_t j = 0; j < dstType::size; j++) { + ctype[j] = typename dstType::value_type(src[i * dstType::size + j]); + } + dst.write(ctype); + } +} + +template void convert_data(hls::FIFO &src, dstType *dst) { + for (size_t i = 0; i < SIZE / srcType::size; i++) { + srcType ctype = src.read(); + for (size_t j = 0; j < srcType::size; j++) { + dst[i * srcType::size + j] = dstType(ctype[j]); + } + } +} + +extern bool trace_enabled; +extern std::map *trace_outputs; +extern size_t trace_type_size; + +template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { + for (int i = 0; i < layer_size; i++) { + ptr[i] = save_T(data[i]); + } +} + +template void save_output_array(hls::FIFO &data, save_T *ptr, size_t layer_size) { + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + ptr[i * data_T::size + j] = save_T(ctype[j]); + } + data.write(ctype); + } +} + +// We don't want to include save_T in this function because it will be inserted into myproject.cpp +// so a workaround with element size is used +template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" 
<< std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (int i = 0; i < layer_size; i++) { + out << float(data[i]) << " "; // We don't care about precision in text files + } + out << std::endl; + out.close(); + } +} + +template void save_layer_output(hls::FIFO &data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + out << float(ctype[j]) << " "; // We don't care about precision in text files + } + data.write(ctype); + } + out << std::endl; + out.close(); + } +} + +#endif + +template void copy_data(std::vector src, dst_T dst[SIZE]) { + typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; + typename std::vector::const_iterator in_end = in_begin + SIZE; + std::copy(in_begin, in_end, dst); +} + +template +void copy_data(std::vector src, hls::FIFO &dst) { + typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; + typename std::vector::const_iterator in_end = in_begin + SIZE; + + size_t i_pack = 0; + dst_T dst_pack; + for (typename std::vector::const_iterator i = in_begin; i != in_end; ++i) { + dst_pack[i_pack++] = typename dst_T::value_type(*i); + if (i_pack == dst_T::size) { + i_pack = 0; + dst.write(dst_pack); + } + } +} + +template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { + for (auto i = 0; i < SIZE; i++) + if (i == SIZE - 1) { + dst[i].data = src[i]; + dst[i].last = 1; + } else { + dst[i].data = src[i]; + dst[i].last = 0; + } +} + +template void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) { + for (uint i = 0; i < SIZE; i++) { + out << result[i] << " "; + } + out << std::endl; +} + +template void print_result(hls::FIFO &result, std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE / res_T::size; i++) { + res_T res_pack = result.read(); + for (int j = 0; j < res_T::size; j++) { + out << res_pack[j] << " "; + } + if (keep) + result.write(res_pack); + } + out << std::endl; +} + +template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); } + +template void fill_zero(hls::FIFO &data) { + for (int i = 0; i < SIZE / data_T::size; i++) { + data_T data_pack; + for (int j = 0; j < data_T::size; j++) { + data_pack[j] = 0.; + } + data.write(data_pack); + } +} + +constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } + +constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); } + +constexpr int pow2(int x) { return x == 0 ? 
1 : 2 * pow2(x - 1); } + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_mult.h b/hls4ml/templates/libero/nnet_utils/nnet_mult.h new file mode 100644 index 0000000000..02418b4618 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_mult.h @@ -0,0 +1,118 @@ +#ifndef NNET_MULT_H_ +#define NNET_MULT_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include +#include +#include + +namespace nnet { + +namespace product { + +/* --- + * different methods to perform the product of input and weight, depending on the + * types of each. + * --- */ + +class Product {}; + +template class both_binary : public Product { + public: + static x_T product(x_T a, w_T w) { + // specialisation for 1-bit weights and incoming data + #pragma HLS function inline + return a == w; + } +}; + +template class weight_binary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 1-bit weights, arbitrary data + #pragma HLS function inline + if (w == 0) + return -a; + else + return a; + } +}; + +template class data_binary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-w) { + // Specialisation for 1-bit data, arbitrary weight + #pragma HLS function inline + if (a == 0) + return -w; + else + return w; + } +}; + +template class weight_ternary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 2-bit weights, arbitrary data + #pragma HLS function inline + if (w == 0) + return 0; + else if (w == -1) + return -a; + else + return a; // if(w == 1) + } +}; + +template class mult : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(a * w) { + // 'Normal' product + #pragma HLS function inline + return a * w; + } +}; + +template class weight_exponential : public Product { + public: + using r_T = hls::ap_fixpt<2 * (decltype(w_T::weight)::width + x_T::width), (decltype(w_T::weight)::width + x_T::width)>; + static r_T product(x_T a, w_T w) { + // Shift product for exponential weights + #pragma HLS function inline + + // Shift by the exponent. Negative weights shift right + r_T y = static_cast(a) << w.weight; + + // Negate or not depending on weight sign + return w.sign == 1 ? 
y : static_cast(-y); + } +}; + +} // namespace product + +template +inline typename std::enable_if>::value && + std::is_same>::value, + hls::ap_int>::type +cast(typename CONFIG_T::accum_t x) { + return (hls::ap_int)(x - CONFIG_T::n_in / 2) * 2; +} + +template +inline typename std::enable_if>::value && + !std::is_same>::value, + res_T>::type +cast(typename CONFIG_T::accum_t x) { + return (res_T)x; +} + +template +inline typename std::enable_if<(!std::is_same>::value), res_T>::type +cast(typename CONFIG_T::accum_t x) { + return (res_T)x; +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_stream.h b/hls4ml/templates/libero/nnet_utils/nnet_stream.h new file mode 100644 index 0000000000..d32fbb5f7f --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_stream.h @@ -0,0 +1,223 @@ +#ifndef NNET_STREAM_H +#define NNET_STREAM_H + +#include "nnet_common.h" +#include + +namespace nnet { + +struct broadcast_config { + static const unsigned in_height = 1; + static const unsigned in_width = 1; + static const unsigned in_chan = 3; + static const unsigned out_height = 2; + static const unsigned out_width = 2; + static const unsigned out_chan = 3; +}; + +template +void clone_stream(hls::FIFO &data, hls::FIFO &res1, hls::FIFO &res2) { +CloneLoop: + #pragma HLS loop pipeline + for (int i = 0; i < N / data_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data1; + res_T out_data2; + // PRAGMA_DATA_PACK(out_data1) + // PRAGMA_DATA_PACK(out_data2) + + ClonePack: + #pragma HLS loop unroll + for (int j = 0; j < data_T::size; j++) { + out_data1[j] = in_data[j]; + out_data2[j] = in_data[j]; + } + + res1.write(out_data1); + res2.write(out_data2); + } +} + +template +void clone_stream(hls::FIFO &data, hls::FIFO &res1, hls::FIFO &res2, hls::FIFO &res3) { +CloneLoop: + #pragma HLS loop pipeline + for (int i = 0; i < N / data_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data1; + res_T out_data2; + res_T out_data3; + // PRAGMA_DATA_PACK(out_data1) + // PRAGMA_DATA_PACK(out_data2) + // PRAGMA_DATA_PACK(out_data3) + + ClonePack: + #pragma HLS loop unroll + for (int j = 0; j < data_T::size; j++) { + out_data1[j] = in_data[j]; + out_data2[j] = in_data[j]; + out_data3[j] = in_data[j]; + } + + res1.write(out_data1); + res2.write(out_data2); + res3.write(out_data3); + } +} + +template void repack_stream(hls::FIFO &data, hls::FIFO &res) { + if (data_T::size == res_T::size) { + #pragma HLS loop pipeline + for (int i = 0; i < N / data_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + #pragma HLS loop unroll + for (int j = 0; j < data_T::size; j++) { + out_data[j] = in_data[j]; + } + + res.write(out_data); + } + } else if (data_T::size > res_T::size) { + constexpr unsigned pack_diff = data_T::size / res_T::size; + if (N / data_T::size > 1) { + #pragma HLS loop pipeline + for (int i = 0; i < N / data_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + #pragma HLS loop pipeline + for (int j = 0; j < pack_diff; j++) { + + res_T out_data; + #pragma HLS loop unroll + for (int k = 0; k < res_T::size; k++) { + out_data[k] = in_data[j * res_T::size + k]; + } + res.write(out_data); + } + } + } else { + for (int i = 0; i < N / data_T::size; i++) { + + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + + #pragma HLS loop pipeline + for (int j = 0; j < pack_diff; j++) { + + res_T out_data; + #pragma HLS loop unroll + for (int k = 0; k < 
res_T::size; k++) { + out_data[k] = in_data[j * res_T::size + k]; + } + res.write(out_data); + } + } + } + } else { // data_T::size < res_T::size + res_T out_data; + constexpr unsigned pack_diff = res_T::size / data_T::size; + unsigned pack_cnt = 0; + #pragma HLS loop pipeline + for (int i = 0; i < N / data_T::size; i++) { + + data_T in_data = data.read(); + #pragma HLS loop unroll + for (int j = 0; j < data_T::size; j++) { + out_data[pack_cnt * data_T::size + j] = in_data[j]; + } + + if (pack_cnt == pack_diff - 1) { + res.write(out_data); + pack_cnt = 0; + } else { + pack_cnt++; + } + } + } +} + +template +void broadcast_stream_1x1xC(hls::FIFO &data, hls::FIFO &res) { + assert(CONFIG_T::in_height == 1 && CONFIG_T::in_width == 1 && CONFIG_T::in_chan == CONFIG_T::out_chan); + int n_dupl = (CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::out_chan) / + (CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan); +BroadcastLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan / data_T::size; i++) { + data_T in_data = data.read(); + #pragma HLS loop pipeline + for (int j = 0; j < n_dupl; j++) { + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + #pragma HLS loop unroll + for (int k = 0; k < res_T::size; k++) { + out_data[k] = in_data[k]; + } + res.write(out_data); + } + } +} + +template +void broadcast_stream_HxWx1(hls::FIFO &data, hls::FIFO &res) { + assert(CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && + CONFIG_T::in_width == CONFIG_T::out_width); +BroadcastLoop: + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::in_chan / data_T::size; i++) { + data_T in_data = data.read(); + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + #pragma HLS loop unroll + for (int k = 0; k < res_T::size; k++) { + out_data[k] = in_data[0]; + } + res.write(out_data); + } +} + +template +void broadcast_stream(hls::FIFO &data, hls::FIFO &res) { + if (CONFIG_T::in_height == 1 && CONFIG_T::in_width == 1 && CONFIG_T::in_chan == CONFIG_T::out_chan) { + broadcast_stream_1x1xC(data, res); + } else if (CONFIG_T::in_chan == 1 && CONFIG_T::in_height == CONFIG_T::out_height && + CONFIG_T::in_width == CONFIG_T::out_width) { + broadcast_stream_HxWx1(data, res); + } +} + +template void transpose_2d(hls::FIFO &data, hls::FIFO &res) { + #pragma HLS memory partition variable(data_array) type(complete) + typename data_T::value_type data_array[CONFIG_T::height * CONFIG_T::width]; + + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / data_T::size; i++) { + data_T in_data = data.read(); + for (int j = 0; j < data_T::size; j++) { + data_array[i * data_T::size + j] = typename data_T::value_type(in_data[j]); + } + } + + #pragma HLS loop pipeline + for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / res_T::size; i++) { + res_T out_data; + // PRAGMA_DATA_PACK(out_data) + for (int j = 0; j < res_T::size; j++) { + out_data[j] = typename res_T::value_type(data_array[j * data_T::size + i]); + } + res.write(out_data); + } +} +} // namespace nnet + +#endif diff --git a/hls4ml/templates/libero/nnet_utils/nnet_types.h b/hls4ml/templates/libero/nnet_utils/nnet_types.h new file mode 100644 index 0000000000..16737a6630 --- /dev/null +++ b/hls4ml/templates/libero/nnet_utils/nnet_types.h @@ -0,0 +1,66 @@ +#ifndef NNET_TYPES_H_ +#define NNET_TYPES_H_ + +#include +#include +#include +#include +#include + +namespace nnet { + +// Fixed-size array +template struct array { 
+ typedef T value_type; + static const unsigned size = N; + + T data[N]; + + T &operator[](size_t pos) { return data[pos]; } + + const T &operator[](size_t pos) const { return data[pos]; } + + array &operator=(const array &other) { + if (&other == this) + return *this; + + assert(N == other.size && "Array sizes must match."); + + #pragma HLS loop unroll + for (unsigned i = 0; i < N; i++) { + data[i] = other[i]; + } + return *this; + } +}; + +// Generic lookup-table implementation, for use in approximations of math functions +template class lookup_table { + public: + lookup_table(T from, T to) : range_start(from), range_end(to), base_div(hls::ap_uint<16>(N) / T(to - from)) { + T step = (range_end - range_start) / hls::ap_uint<16>(N); + for (size_t i = 0; i < N; i++) { + T num = range_start + hls::ap_uint<16>(i) * step; + T sample = func(num); + samples[i] = sample; + } + } + + T operator()(T n) const { + int index = (n - range_start) * base_div; + if (index < 0) + index = 0; + else if (index > N - 1) + index = N - 1; + return samples[index]; + } + + private: + T samples[N]; + const T range_start, range_end; + hls::ap_fixpt<20, 16> base_div; +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index 8de19fe1d2..8c48f79d2d 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -1,4 +1,5 @@ from hls4ml.writer.catapult_writer import CatapultWriter +from hls4ml.writer.libero_writer import LiberoWriter from hls4ml.writer.oneapi_writer import OneAPIWriter from hls4ml.writer.quartus_writer import QuartusWriter from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter @@ -13,4 +14,5 @@ register_writer('Quartus', QuartusWriter) register_writer('oneAPI', OneAPIWriter) register_writer('Catapult', CatapultWriter) +register_writer('Libero', LiberoWriter) register_writer('SymbolicExpression', SymbolicExpressionWriter) diff --git a/hls4ml/writer/libero_writer.py b/hls4ml/writer/libero_writer.py new file mode 100644 index 0000000000..7e6dfd03e9 --- /dev/null +++ b/hls4ml/writer/libero_writer.py @@ -0,0 +1,844 @@ +import glob +import stat +import tarfile +from collections import OrderedDict +from pathlib import Path +from shutil import copyfile + +import numpy as np +import yaml + +from hls4ml.writer.writers import Writer + +config_filename = 'hls4ml_config.yml' + + +class LiberoWriter(Writer): + def print_array_to_cpp(self, var, odir, namespace=None, write_txt_file=True): + """Write a weights array to C++ header files. + + Args: + var (WeightVariable): Weight to write + odir (str): Output directory + namespace (str, optional): Writes a namespace for the weights to avoid clashes with global variables. + write_txt_file (bool, optional): Write txt files in addition to .h files. Defaults to True. 
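+
+        A minimal usage sketch (illustrative only; assumes an already-populated ModelGraph
+        'model' whose output directory contains a firmware/weights folder, and 'hls4ml_prj'
+        is a placeholder namespace):
+
+            writer = LiberoWriter()
+            for layer in model.get_layers():
+                for w in layer.get_weights():
+                    writer.print_array_to_cpp(w, model.config.get_output_dir(), namespace='hls4ml_prj')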
+ """ + + h_file = open(f'{odir}/firmware/weights/{var.name}.h', 'w') + if write_txt_file: + txt_file = open(f'{odir}/firmware/weights/{var.name}.txt', 'w') + + # meta data + h_file.write(f'//Numpy array shape {var.shape}\n') + h_file.write(f'//Min {np.min(var.min):.12f}\n') + h_file.write(f'//Max {np.max(var.max):.12f}\n') + h_file.write(f'//Number of zeros {var.nzeros}\n') + h_file.write('\n') + + h_file.write(f'#ifndef {var.name.upper()}_H_\n') + h_file.write(f'#define {var.name.upper()}_H_\n') + h_file.write('\n') + + if namespace is not None: + h_file.write(f'namespace {namespace} {{\n\n') + + if write_txt_file: + h_file.write('#ifndef __SYNTHESIS__\n') + h_file.write(var.definition_cpp() + ';\n') + h_file.write('#else\n') + + h_file.write(var.definition_cpp() + ' = {') + + # fill c++ array. + # not including internal brackets for multidimensional case + sep = '' + for x in var: + h_file.write(sep + x) + if write_txt_file: + txt_file.write(sep + x) + sep = ', ' + h_file.write('};\n\n') + + if write_txt_file: + h_file.write('#endif\n') + txt_file.close() + + if namespace is not None: + h_file.write('}\n\n') + + h_file.write('\n#endif\n') + h_file.close() + + def write_project_dir(self, model): + """Write the base project directory + + Args: + model (ModelGraph): the hls4ml model. + """ + out_path = Path(f'{model.config.get_output_dir()}/firmware/weights') + out_path.mkdir(parents=True, exist_ok=True) + + @staticmethod + def _make_array_pragma(variable, is_argument=False): + """ + Layers in ModelGraph can specify output array partitioning through the `pragma` attribute. + If `pragma` is a string: options are 'partition' or 'stream'. + If `pragma` is a tuple: (mode, type, factor) where mode is 'partition', type is + 'complete', 'cyclic', or 'block', and factor is an integer only used when the type is not 'complete'. + """ + + config = variable.pragma + if type(config) is tuple: + mode = config[0] + if mode == 'partition': + typ = config[1] + if typ != 'complete': + factor = config[2] + elif mode == 'stream': + depth = config[1] + else: + mode = config + typ = 'complete' + factor = 0 + + arg_name = 'argument' if is_argument else 'variable' + + if mode == 'partition': + if typ == 'complete': + template = '#pragma HLS memory partition {arg_name}({name}) type({type}) dim({dim})' + else: + template = '#pragma HLS memory partition {arg_name}({name}) type({type}) factor({factor}) dim({dim})' + + return template.format(mode=mode.upper(), name=variable.name, type=typ, factor=factor, dim=0, arg_name=arg_name) + + elif mode == 'stream': + # TODO update for streaming IO + return f'#pragma HLS STREAM {arg_name}={variable.name} depth={depth}' + + def write_project_cpp(self, model): + """Write the main architecture source file (myproject.cpp) + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + + filedir = Path(__file__).parent + prj_name = model.config.get_project_name() + prj_cpp_src = (filedir / '../templates/libero/firmware/myproject.cpp').resolve() + prj_cpp_dst = Path(f'{model.config.get_output_dir()}/firmware/{prj_name}.cpp').resolve() + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + prj_name = prj_name + + indent = ' ' + + with open(prj_cpp_src) as src, open(prj_cpp_dst, 'w') as dst: + for line in src.readlines(): + # Add headers to weights and biases + if 'myproject' in line: + newline = line.replace('myproject', prj_name) + + elif '// hls-fpga-machine-learning insert header' in line: + inputs_str = ', '.join([i.definition_cpp(as_reference=True) for i in model_inputs]) + outputs_str = ', '.join([o.definition_cpp(as_reference=True) for o in model_outputs]) + brams_str = ', \n'.join([indent + b.definition_cpp(as_reference=False) for b in model_brams]) + + newline = '' + newline += indent + inputs_str + ',\n' + newline += indent + outputs_str + if len(model_brams) > 0: + newline += ',\n' + brams_str + newline += '\n' + + elif '// hls-fpga-machine-learning insert namespace-start' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += f'namespace {namespace} {{\n' + + elif '// hls-fpga-machine-learning insert namespace-end' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += '}\n' + + elif '// hls-fpga-machine-learning insert load weights' in line: + newline = line + if model.config.get_writer_config()['WriteWeightsTxt']: + + newline += '#ifndef __SYNTHESIS__\n' + newline += ' static bool loaded_weights = false;\n' + newline += ' if (!loaded_weights) {\n' + + for layer in model.get_layers(): + for w in layer.get_weights(): + if w.weight_class == 'CompressedWeightVariable': + newline += ( + indent + + ' nnet::load_compressed_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.nonzeros, w.name, w.name + ) + ) + elif w.weight_class == 'ExponentWeightVariable': + newline += ( + indent + + ' nnet::load_exponent_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.data_length, w.name, w.name + ) + ) + else: + newline += indent + ' nnet::load_weights_from_txt<{}, {}>({}, "{}.txt");\n'.format( + w.type.name, w.data_length, w.name, w.name + ) + + newline += ' loaded_weights = true;\n' + newline += ' }\n' + newline += '#endif' + + # Add input/output type + elif '// hls-fpga-machine-learning insert IO' in line: + newline = '' + all_inputs = [i.name for i in model_inputs] + all_outputs = [o.name for o in model_outputs] + all_brams = [b.name for b in model_brams] + io_type = model.config.get_config_value('IOType') + + pipeline_style = model.config.pipeline_style + pipeline_ii = model.config.pipeline_ii + pipeline_pragma = indent + f'#pragma HLS function {pipeline_style}' + if pipeline_style == 'pipeline' and pipeline_ii is not None: + pipeline_pragma += f' II({pipeline_ii})\n' + else: + pipeline_pragma += '\n' + + if io_type == 'io_parallel': + for i in model_inputs: + newline += indent + self._make_array_pragma(i, is_argument=True) + '\n' + for o in model_outputs: + newline += indent + self._make_array_pragma(o, is_argument=True) + '\n' + # TODO Expose interface in a backend config + newline += indent + '#pragma HLS interface control 
type(simple)\n' + for input_name in all_inputs: + newline += indent + f'#pragma HLS interface argument({input_name}) type(simple)\n' + for output_name in all_outputs: + newline += indent + f'#pragma HLS interface argument({output_name}) type(simple)\n' + newline += pipeline_pragma + + if io_type == 'io_stream': + newline += indent + '#pragma HLS interface control type(axi_target)\n' + newline += indent + '#pragma HLS interface default type(axi_target)' + for bram_name in all_brams: + newline += indent + f'#pragma HLS interface argument({bram_name}) dma(true)\n' + newline += pipeline_pragma + + elif '// hls-fpga-machine-learning insert layers' in line: + newline = line + '\n' + for layer in model.get_layers(): + vars = layer.get_variables() + for var in vars: + if var not in model_inputs and var not in model_outputs: + def_cpp = var.definition_cpp() + if def_cpp is not None: + if var.pragma: + newline += ' ' + self._make_array_pragma(var) + '\n' + newline += ' ' + def_cpp + ';\n' + func = layer.get_attr('function_cpp', None) + if func: + if not isinstance(func, (list, set)): + func = [func] + if len(func) == 1: + newline += ' ' + func[0] + ' // ' + layer.name + '\n' + else: + newline += ' // ' + layer.name + '\n' + for line in func: + newline += ' ' + line + '\n' + if model.config.trace_output and layer.get_attr('trace', False): + newline += '#ifndef __SYNTHESIS__\n' + for var in vars: + newline += ' nnet::save_layer_output<{}>({}, "{}", {});\n'.format( + var.type.name, var.name, layer.name, var.size_cpp() + ) + newline += '#endif\n' + newline += '\n' + + # Just copy line + else: + newline = line + dst.write(newline) + + def write_project_header(self, model): + """Write the main architecture header file (myproject.h) + + Args: + model (ModelGraph): the hls4ml model. 
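+
+        The '// hls-fpga-machine-learning insert header' marker is replaced with the same
+        argument list as in the source file, so for a small io_parallel model the resulting
+        prototype looks roughly like (type and array names are illustrative):
+
+            void myproject(input_t input_1[N_INPUT_1_1], result_t layer2_out[N_LAYER_2]);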
+ """ + + filedir = Path(__file__).parent + prj_name = model.config.get_project_name() + prj_h_src = (filedir / '../templates/libero/firmware/myproject.h').resolve() + prj_h_dst = Path(f'{model.config.get_output_dir()}/firmware/{prj_name}.h').resolve() + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + indent = ' ' + + with open(prj_h_src) as src, open(prj_h_dst, 'w') as dst: + for line in src.readlines(): + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', format(prj_name.upper())) + + elif 'myproject' in line: + newline = line.replace('myproject', prj_name) + + elif '// hls-fpga-machine-learning insert header' in line: + inputs_str = ', '.join([i.definition_cpp(as_reference=True) for i in model_inputs]) + outputs_str = ', '.join([o.definition_cpp(as_reference=True) for o in model_outputs]) + brams_str = ', \n'.join([indent + b.definition_cpp(as_reference=False) for b in model_brams]) + + newline = '' + newline += indent + inputs_str + ',\n' + newline += indent + outputs_str + if len(model_brams) > 0: + newline += ',\n' + brams_str + newline += '\n' + + elif '// hls-fpga-machine-learning insert namespace-start' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += f'namespace {namespace} {{\n' + + elif '// hls-fpga-machine-learning insert namespace-end' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += '}\n' + + else: + newline = line + dst.write(newline) + + def write_defines(self, model): + """Write the C++ type definitions file (defines.h) + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + filedir = Path(__file__).parent + defines_src = (filedir / '../templates/libero/firmware/defines.h').resolve() + defines_dst = Path(f'{model.config.get_output_dir()}/firmware/defines.h').resolve() + + with open(defines_src) as src, open(defines_dst, 'w') as dst: + for line in src.readlines(): + # Insert numbers + if '// hls-fpga-machine-learning insert numbers' in line: + newline = line + + defines_list = [] + for layer in model.get_layers(): + defines = '' + for k, v in layer.get_output_variable().get_shape(): + defines += f'#define {k} {v}\n' + + defines_list.append(defines) + + newline += ''.join(defines_list) + + elif '// hls-fpga-machine-learning insert layer-precision' in line: + newline = line + all_precision = OrderedDict() + for layer in model.get_layers(): + layer_precision = layer.get_layer_precision() + for type_name, type_var in layer_precision.items(): + # Ensure that layer's types doesn't override existing types + # This can happen in case of InplaceVariable types + if type_name not in all_precision: + all_precision[type_name] = type_var + for used_type in all_precision.values(): + newline += used_type.definition_cpp() + + elif '// hls-fpga-machine-learning insert namespace-start' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += f'namespace {namespace} {{\n' + + elif '// hls-fpga-machine-learning insert namespace-end' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += '}\n' + + else: + newline = line + dst.write(newline) + + def write_parameters(self, model): + """Write the C++ layer config file (parameters.h) + + Args: + model (ModelGraph): the hls4ml model. + """ + filedir = Path(__file__).parent + params_src = (filedir / '../templates/libero/firmware/parameters.h').resolve() + params_dst = Path(f'{model.config.get_output_dir()}/firmware/parameters.h').resolve() + + with open(params_src) as src, open(params_dst, 'w') as dst: + for line in src.readlines(): + if '// hls-fpga-machine-learning insert includes' in line: + newline = line + for include in sorted( + set(sum((layer.get_attr('include_header', []) for layer in model.get_layers()), [])) + ): + newline += '#include "%s"\n' % include + + elif '// hls-fpga-machine-learning insert weights' in line: + newline = line + for layer in model.get_layers(): + for w in layer.get_weights(): + if w.storage.lower() != 'bram': + newline += f'#include "weights/{w.name}.h"\n' + + elif "// hls-fpga-machine-learning insert layer-config" in line: + newline = line + for layer in model.get_layers(): + config = layer.get_attr('config_cpp', None) + if config: + newline += '// ' + layer.name + '\n' + newline += config + '\n' + + elif '// hls-fpga-machine-learning insert namespace-start' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += f'namespace {namespace} {{\n' + + elif '// hls-fpga-machine-learning insert namespace-end' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += '}\n' + + else: + newline = line + dst.write(newline) + + def write_weights(self, model): + """Write the weights into header files + + Args: + model (ModelGraph): the hls4ml model. 
+ """ + namespace = model.config.get_writer_config().get('Namespace', None) + write_txt = model.config.get_writer_config().get('WriteWeightsTxt', True) + for layer in model.get_layers(): + for weights in layer.get_weights(): + self.print_array_to_cpp( + weights, model.config.get_output_dir(), namespace=namespace, write_txt_file=write_txt + ) + + def __make_dat_file(self, original_path, project_path): + """ + Convert other input/output data types into a dat file, which is + a text file with the falttened matrix printed out. Note that ' ' is + assumed to be the delimiter. + """ + + # Take in data from current supported data files + if original_path[-3:] == "npy": + data = np.load(original_path) + else: + raise Exception("Unsupported input/output data files.") + + # Faltten data, just keep first dimension + data = data.reshape(data.shape[0], -1) + + def print_data(f): + for i in range(data.shape[0]): + for j in range(data.shape[1]): + f.write(str(data[i][j]) + " ") + f.write("\n") + + # Print out in dat file + with open(project_path, "w") as f: + print_data(f) + + def write_test_bench(self, model): + """Write the testbench files (myproject_test.cpp and input/output .dat files) + + Args: + model (ModelGraph): the hls4ml model. + """ + + filedir = Path(__file__).parent + prj_name = model.config.get_project_name() + out_dir = model.config.get_output_dir() + + tb_data_dir = Path(f'{out_dir}/tb_data/').resolve() + tb_data_dir.mkdir(parents=True, exist_ok=True) + + input_data = model.config.get_config_value('InputData') + output_predictions = model.config.get_config_value('OutputPredictions') + + if input_data: + if input_data[-3:] == 'dat': + copyfile(input_data, f'{out_dir}/tb_data/tb_input_features.dat') + else: + self.__make_dat_file(input_data, f'{out_dir}/tb_data/tb_input_features.dat') + + if output_predictions: + if output_predictions[-3:] == 'dat': + copyfile(output_predictions, f'{out_dir}/tb_data/tb_output_predictions.dat') + else: + self.__make_dat_file(output_predictions, f'{out_dir}/tb_data/tb_output_predictions.dat') + + tb_src = (filedir / '../templates/libero/myproject_test.cpp').resolve() + tb_dst = Path(f'{out_dir}/{prj_name}_test.cpp').resolve() + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + with open(tb_src) as src, open(tb_dst, 'w') as dst: + for line in src.readlines(): + indent = ' ' * (len(line) - len(line.lstrip(' '))) + + # Insert numbers + if 'myproject' in line: + newline = line.replace('myproject', model.config.get_project_name()) + + elif '// hls-fpga-machine-learning insert bram' in line: + newline = line + for bram in model_brams: + newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + + elif '// hls-fpga-machine-learning insert data' in line: + newline = line + offset = 0 + for inp in model_inputs: + newline += ' ' + inp.definition_cpp() + ';\n' + newline += ' nnet::copy_data(in, {});\n'.format( + inp.type.name, offset, inp.size_cpp(), inp.name + ) + offset += inp.size() + for out in model_outputs: + newline += ' ' + out.definition_cpp() + ';\n' + + elif '// hls-fpga-machine-learning insert zero' in line: + newline = line + for inp in model_inputs: + newline += indent + inp.definition_cpp() + ';\n' + newline += indent + f'nnet::fill_zero<{inp.type.name}, {inp.size_cpp()}>({inp.name});\n' + for out in model_outputs: + newline += indent + out.definition_cpp() + ';\n' + + elif '// hls-fpga-machine-learning insert 
top-level-function' in line: + newline = line + + input_vars = ','.join([i.name for i in model_inputs]) + output_vars = ','.join([o.name for o in model_outputs]) + bram_vars = ','.join([b.name for b in model_brams]) + + # Concatenate the input, output, and bram variables. Filter out empty/null values + all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars])) + + top_level = indent + f'{model.config.get_project_name()}({all_vars});\n' + + newline += top_level + + elif '// hls-fpga-machine-learning insert predictions' in line: + newline = line + for out in model_outputs: + newline += indent + f'for(int i = 0; i < {out.size_cpp()}; i++) {{\n' + newline += indent + ' std::cout << pr[i] << " ";\n' + newline += indent + '}\n' + newline += indent + 'std::cout << std::endl;\n' + + elif '// hls-fpga-machine-learning insert tb-output' in line: + newline = line + for out in model_outputs: + newline += indent + 'nnet::print_result<{}, {}>({}, fout);\n'.format( + out.type.name, out.size_cpp(), out.name + ) # TODO enable this + + elif ( + '// hls-fpga-machine-learning insert output' in line + or '// hls-fpga-machine-learning insert quantized' in line + ): + newline = line + for out in model_outputs: + newline += indent + 'nnet::print_result<{}, {}>({}, std::cout, true);\n'.format( + out.type.name, out.size_cpp(), out.name + ) + + elif '// hls-fpga-machine-learning insert namespace' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += indent + f'using namespace {namespace};\n' + + else: + newline = line + dst.write(newline) + + def write_bridge(self, model): + """Write the Python-C++ bridge (myproject_bridge.cpp) + + Args: + model (ModelGraph): the hls4ml model. + """ + + filedir = Path(__file__).parent + prj_name = model.config.get_project_name() + bridge_src = (filedir / '../templates/libero/myproject_bridge.cpp').resolve() + bridge_dst = Path(f'{model.config.get_output_dir()}/{prj_name}_bridge.cpp').resolve() + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + indent = ' ' + + with open(bridge_src) as src, open(bridge_dst, 'w') as dst: + for line in src.readlines(): + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', prj_name.upper()) + + elif 'myproject' in line: + newline = line.replace('myproject', prj_name) + + elif '// hls-fpga-machine-learning insert bram' in line: + newline = line + for bram in model_brams: + newline += f'#include \"firmware/weights/{bram.name}.h\"\n' + + elif '// hls-fpga-machine-learning insert header' in line: + dtype = line.split('#', 1)[1].strip() + inputs_str = ', '.join([f'{dtype} {i.name}[{i.size_cpp()}]' for i in model_inputs]) + outputs_str = ', '.join([f'{dtype} {o.name}[{o.size_cpp()}]' for o in model_outputs]) + + newline = '' + newline += indent + inputs_str + ',\n' + newline += indent + outputs_str + '\n' + + elif '// hls-fpga-machine-learning insert wrapper' in line: + dtype = line.split('#', 1)[1].strip() + newline = '' + for i in model_inputs: + def_cpp = i.definition_cpp(name_suffix='_ap') + vname = i.name + tname = i.type.name + size = i.size_cpp() + newline += indent + f'{def_cpp};\n' + newline += indent + f'nnet::convert_data<{dtype}, {tname}, {size}>({vname}, {vname}_ap);\n' + newline += '\n' + + for o in model_outputs: + def_cpp = o.definition_cpp(name_suffix='_ap') + newline += indent + f'{def_cpp};\n' + + 
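+                    # The generated call below passes arguments positionally in the same order as
+                    # the top-level function signature: converted inputs, then outputs, then any
+                    # BRAM-backed weight arrays.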
newline += '\n' + + input_vars = ','.join([i.name + '_ap' for i in model_inputs]) + bram_vars = ','.join([b.name for b in model_brams]) + output_vars = ','.join([o.name + '_ap' for o in model_outputs]) + + # Concatenate the input, output, and bram variables. Filter out empty/null values + all_vars = ','.join(filter(None, [input_vars, output_vars, bram_vars])) + + top_level = indent + f'{prj_name}({all_vars});\n' + newline += top_level + + newline += '\n' + + for o in model_outputs: + vname = o.name + tname = o.type.name + size = o.size_cpp() + newline += indent + f'nnet::convert_data<{tname}, {dtype}, {size}>({vname}_ap, {vname});\n' + + elif '// hls-fpga-machine-learning insert trace_outputs' in line: + newline = '' + for layer in model.get_layers(): + func = layer.get_attr('function_cpp', None) + if func and model.config.trace_output and layer.get_attr('trace', False): + vars = layer.get_variables() + for var in vars: + newline += ( + indent + + 'nnet::trace_outputs->insert(std::pair(' + + f'"{layer.name}", (void *) malloc({var.size_cpp()} * element_size)));\n' + ) + + elif '// hls-fpga-machine-learning insert namespace' in line: + newline = '' + + namespace = model.config.get_writer_config().get('Namespace', None) + if namespace is not None: + newline += indent + f'using namespace {namespace};\n' + + else: + newline = line + dst.write(newline) + + def write_build_script(self, model): + """Write the TCL/Shell build scripts (config.tcl, Makefile, build_lib.sh) + + Args: + model (ModelGraph): the hls4ml model. + """ + + filedir = Path(__file__).parent + prj_name = model.config.get_project_name() + + # project.tcl + cfg_tcl_dst = Path(f'{model.config.get_output_dir()}/config.tcl') + with open(cfg_tcl_dst, 'w') as f: + f.write('source $env(SHLS_ROOT_DIR)/examples/legup.tcl\n') + fpga_family = model.config.get_config_value('FPGAFamily') + fpga_part = model.config.get_config_value('Part') + board = model.config.get_config_value('Board') + clock = model.config.get_config_value('ClockPeriod') + f.write(f'set_project {fpga_family} {fpga_part} {board}\n') + f.write(f'set_parameter CLOCK_PERIOD {clock}\n') + + # Makefile + makefile_dst = Path(f'{model.config.get_output_dir()}/Makefile') + with open(makefile_dst, 'w') as f: + f.write(f'NAME = {prj_name}\n') + f.write('LOCAL_CONFIG = -legup-config=config.tcl\n') + f.write(f'SRCS = firmware/{prj_name}.cpp {prj_name}_test.cpp \n') + # Not sure if this is required, it is present in both GUI- and CLI-generated projects + f.write('LEVEL = $(SHLS_ROOT_DIR)/examples\n') + # This must be the last line + f.write('include $(LEVEL)/Makefile.common\n') + + # build_lib.sh + build_lib_src = (filedir / '../templates/libero/build_lib.sh').resolve() + build_lib_dst = Path(f'{model.config.get_output_dir()}/build_lib.sh').resolve() + with open(build_lib_src) as src, open(build_lib_dst, 'w') as dst: + for line in src.readlines(): + line = line.replace('myproject', prj_name) + line = line.replace('mystamp', model.config.get_config_value('Stamp')) + + dst.write(line) + build_lib_dst.chmod(build_lib_dst.stat().st_mode | stat.S_IEXEC) + + def write_nnet_utils(self, model): + """Copy the nnet_utils, AP types headers and any custom source to the project output directory + + Args: + model (ModelGraph): the hls4ml model. 
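+
+        After this step the output directory holds the Libero copies of the nnet_utils headers
+        (firmware/nnet_utils/nnet_dense.h, nnet_activation.h, ...) together with any extra
+        source files the backend registers through get_custom_source().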
+ """ + + # nnet_utils + filedir = Path(__file__).parent + out_dir = model.config.get_output_dir() + + srcpath = (filedir / '../templates/libero/nnet_utils/').resolve() + dstpath = Path(f'{out_dir}/firmware/nnet_utils/').resolve() + dstpath.mkdir(parents=True, exist_ok=True) + + headers = [Path(h).name for h in glob.glob(str(srcpath / '*.h'))] + + for h in headers: + copyfile(srcpath / h, dstpath / h) + + # custom source + custom_source = model.config.backend.get_custom_source() + for dst, srcpath in custom_source.items(): + dstpath = Path(f'{out_dir}/firmware/{dst}') + copyfile(srcpath, dstpath) + + def write_generated_code(self, model): + """Write the generated code (nnet_code_gen.h) + + Args: + model (ModelGraph): the hls4ml model. + """ + codegen_path = Path(f'{model.config.get_output_dir()}/firmware/nnet_utils/nnet_code_gen.h') + with open(codegen_path) as src: + contents = src.readlines() + with open(codegen_path, 'w') as dst: + namespace = model.config.get_writer_config().get('Namespace', None) + + for line in contents: + if '// hls4ml insert code' in line: + newline = line + for layer in model.get_layers(): + for generated_code in layer.code.values(): + newline += str(generated_code) + else: + newline = line + if namespace is not None: + if 'namespace nnet' in newline: + newline = newline.replace('namespace nnet', f'namespace {namespace}') + dst.write(newline) + + def write_yml(self, model): + """Write the config to the YAML file + + Args: + model (ModelGraph): the hls4ml model. + """ + + def keras_model_representer(dumper, keras_model): + model_path = model.config.get_output_dir() + '/keras_model.keras' + keras_model.save(model_path) + return dumper.represent_scalar('!keras_model', model_path) + + try: + import keras + + KerasModel = keras.models.Model + + yaml.add_multi_representer(KerasModel, keras_model_representer) + except Exception: + pass + + with open(model.config.get_output_dir() + '/' + config_filename, 'w') as file: + yaml.dump(model.config.config, file) + + def write_tar(self, model): + """Write the generated project as a .tar.gz archive + + Args: + model (ModelGraph): the hls4ml model. + """ + + write_tar = model.config.get_writer_config().get('WriteTar', False) + if write_tar: + tar_path = Path(model.config.get_output_dir() + '.tar.gz') + tar_path.unlink(missing_ok=True) + with tarfile.open(tar_path, mode='w:gz') as archive: + archive.add(model.config.get_output_dir(), recursive=True, arcname='') + + def write_hls(self, model): + print('Writing HLS project') + self.write_project_dir(model) + self.write_project_cpp(model) + self.write_project_header(model) + self.write_weights(model) + self.write_defines(model) + self.write_parameters(model) + self.write_test_bench(model) + self.write_bridge(model) + self.write_build_script(model) + self.write_nnet_utils(model) + self.write_generated_code(model) + self.write_yml(model) + self.write_tar(model) + print('Done')